2018-07-26 04:00:37 +08:00
|
|
|
// This file is part of OpenCV project.
|
|
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
|
|
// of this distribution and at http://opencv.org/license.html
|
|
|
|
|
|
|
|
#include "opencv2/core/hal/intrin.hpp"
|
|
|
|
|
|
|
|
//=========================================
|
|
|
|
// Declare & Define & Dispatch in one step
|
|
|
|
//=========================================
|
|
|
|
|
|
|
|
// ARITHM_DISPATCHING_ONLY is defined by the arithm dispatch file
|
|
|
|
|
|
|
|
// Reset the file-local "declarations only" flag, then raise it again when
// CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY is defined for this inclusion.
#undef ARITHM_DECLARATIONS_ONLY
#if defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
#define ARITHM_DECLARATIONS_ONLY
#endif
|
|
|
|
|
|
|
|
// Reset the file-local "definitions only" flag. It is raised only when the
// file is included neither in declarations-only mode nor in dispatching-only
// mode.
#undef ARITHM_DEFINITIONS_ONLY
#if !(defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) || defined(ARITHM_DISPATCHING_ONLY))
#define ARITHM_DEFINITIONS_ONLY
#endif
|
|
|
|
|
|
|
|
// Declarations-only mode: DEFINE_SIMD forwards just the function name and the
// element type to DECLARE_SIMD_FUN; every further argument is ignored.
#if defined(ARITHM_DECLARATIONS_ONLY)
#undef DEFINE_SIMD
#define DEFINE_SIMD(fun_name, c_type, ...) \
    DECLARE_SIMD_FUN(fun_name, c_type)
#endif // ARITHM_DECLARATIONS_ONLY
|
|
|
|
|
|
|
|
// Definitions-only mode: DEFINE_SIMD expands to DECLARE_SIMD_FUN followed by
// DEFINE_SIMD_FUN, which additionally receives the vector type and the
// remaining variadic arguments.
#if defined(ARITHM_DEFINITIONS_ONLY)
#undef DEFINE_SIMD
#define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
    DECLARE_SIMD_FUN(fun_name, c_type)             \
    DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
// Dispatching-only mode: DEFINE_SIMD forwards all of its arguments unchanged
// to DISPATCH_SIMD_FUN.
#if defined(ARITHM_DISPATCHING_ONLY)
#undef DEFINE_SIMD
#define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
    DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
#endif // ARITHM_DISPATCHING_ONLY
|
|
|
|
|
|
|
|
// Workaround for NEON targets that lack double-precision support
|
|
|
|
// DEFINE_NOSIMD:
//  - definitions-only mode: declare the function, then define it through
//    DEFINE_NOSIMD_FUN (no vector type is passed along);
//  - any other mode: plain alias for DEFINE_SIMD.
#undef DEFINE_NOSIMD
#if defined(ARITHM_DEFINITIONS_ONLY)
#define DEFINE_NOSIMD(fun_name, c_type, ...) \
    DECLARE_SIMD_FUN(fun_name, c_type)       \
    DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__)
#else
#define DEFINE_NOSIMD DEFINE_SIMD
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
#ifndef SIMD_GUARD
|
|
|
|
|
|
|
|
// Per-element-type wrappers around DEFINE_SIMD. Each one appends a depth
// suffix to `fun` via __CV_CAT and supplies the matching scalar type and
// vector type; the remaining arguments are forwarded untouched.

// unsigned 8-bit: suffix 8u, uchar / v_uint8
#define DEFINE_SIMD_U8(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__)

// signed 8-bit: suffix 8s, schar / v_int8
#define DEFINE_SIMD_S8(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__)

// unsigned 16-bit: suffix 16u, ushort / v_uint16
#define DEFINE_SIMD_U16(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__)

// signed 16-bit: suffix 16s, short / v_int16
#define DEFINE_SIMD_S16(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__)

// signed 32-bit: suffix 32s, int / v_int32
#define DEFINE_SIMD_S32(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__)

// 32-bit float: suffix 32f, float / v_float32
#define DEFINE_SIMD_F32(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
// 64-bit float wrapper (suffix 64f, double). When either 64F SIMD flag is set
// it goes through DEFINE_SIMD with v_float64; otherwise it goes through
// DEFINE_NOSIMD, which is not given a vector type.
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SIMD_F64(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
#else
#define DEFINE_SIMD_F64(fun, ...) \
    DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__)
#endif
|
|
|
|
|
|
|
|
// Group wrappers that instantiate `fun` for several element types at once.

// 8-bit and 16-bit integer types (U8, S8, U16, S16).
#define DEFINE_SIMD_SAT(fun, ...)     \
    DEFINE_SIMD_U8(fun, __VA_ARGS__)  \
    DEFINE_SIMD_S8(fun, __VA_ARGS__)  \
    DEFINE_SIMD_U16(fun, __VA_ARGS__) \
    DEFINE_SIMD_S16(fun, __VA_ARGS__)

// 32-bit integer and floating-point types (S32, F32, F64).
#define DEFINE_SIMD_NSAT(fun, ...)    \
    DEFINE_SIMD_S32(fun, __VA_ARGS__) \
    DEFINE_SIMD_F32(fun, __VA_ARGS__) \
    DEFINE_SIMD_F64(fun, __VA_ARGS__)

// Every type covered by the two groups above.
#define DEFINE_SIMD_ALL(fun, ...)     \
    DEFINE_SIMD_SAT(fun, __VA_ARGS__) \
    DEFINE_SIMD_NSAT(fun, __VA_ARGS__)
|
|
|
|
|
|
|
|
#endif // SIMD_GUARD
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
namespace cv { namespace hal {
|
|
|
|
|
|
|
|
#ifndef ARITHM_DISPATCHING_ONLY
|
|
|
|
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
// Without 64F SIMD support, provide a placeholder v_float64 so that code
// naming the type still compiles. It is a plain int, not a vector type.
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
using v_float64 = int; // dummy
#endif
|
|
|
|
|
|
|
|
//=======================================
|
|
|
|
// Utility
|
|
|
|
//=======================================
|
|
|
|
|
|
|
|
/** add **/
|
|
|
|
template<typename T>
|
|
|
|
static inline T c_add(T a, T b)
|
|
|
|
{ return saturate_cast<T>(a + b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_add<uchar>(uchar a, uchar b)
|
|
|
|
{ return CV_FAST_CAST_8U(a + b); }
|
|
|
|
// scale
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline T1 c_add(T1 a, T1 b, T2 scalar)
|
|
|
|
{ return saturate_cast<T1>((T2)a * scalar + b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_add<uchar, float>(uchar a, uchar b, float scalar)
|
|
|
|
{ return saturate_cast<uchar>(CV_8TO32F(a) * scalar + b); }
|
|
|
|
// weight
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma)
|
|
|
|
{ return saturate_cast<T1>(a * alpha + b * beta + gamma); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_add<uchar, float>(uchar a, uchar b, float alpha, float beta, float gamma)
|
|
|
|
{ return saturate_cast<uchar>(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); }
|
|
|
|
|
|
|
|
/** sub **/
|
|
|
|
template<typename T>
|
|
|
|
static inline T c_sub(T a, T b)
|
|
|
|
{ return saturate_cast<T>(a - b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_sub<uchar>(uchar a, uchar b)
|
|
|
|
{ return CV_FAST_CAST_8U(a - b); }
|
|
|
|
|
|
|
|
/** max **/
|
|
|
|
// Scalar maximum of two values, delegated to std::max.
template<typename T>
static inline T c_max(T a, T b)
{
    return std::max<T>(a, b);
}
|
|
|
|
template<>
|
|
|
|
inline uchar c_max<uchar>(uchar a, uchar b)
|
|
|
|
{ return CV_MAX_8U(a, b); }
|
|
|
|
|
|
|
|
/** min **/
|
|
|
|
// Scalar minimum of two values, delegated to std::min.
template<typename T>
static inline T c_min(T a, T b)
{
    return std::min<T>(a, b);
}
|
|
|
|
template<>
|
|
|
|
inline uchar c_min<uchar>(uchar a, uchar b)
|
|
|
|
{ return CV_MIN_8U(a, b); }
|
|
|
|
|
|
|
|
/** absdiff **/
|
|
|
|
// Scalar absolute difference |a - b|: the smaller operand is subtracted from
// the larger one and the result is converted back to T on return.
template<typename T>
static inline T c_absdiff(T a, T b)
{ return a > b ? a - b : b - a; }
// int: the true difference may not fit in an int (e.g. INT_MAX vs INT_MIN),
// and signed overflow is undefined behaviour. Subtract in unsigned arithmetic,
// which wraps modulo 2^N, and convert the bit pattern back to int.
// (Before C++20 that final conversion is implementation-defined for values
// above INT_MAX; mainstream compilers wrap.)
template<>
inline int c_absdiff<int>(int a, int b)
{
    const unsigned ua = static_cast<unsigned>(a);
    const unsigned ub = static_cast<unsigned>(b);
    return static_cast<int>(a > b ? ua - ub : ub - ua);
}
|
|
|
|
template<>
|
|
|
|
inline schar c_absdiff(schar a, schar b)
|
|
|
|
{ return saturate_cast<schar>(std::abs(a - b)); }
|
|
|
|
template<>
|
|
|
|
inline short c_absdiff(short a, short b)
|
|
|
|
{ return saturate_cast<short>(std::abs(a - b)); }
|
|
|
|
// specializations to prevent "-0" results
|
|
|
|
template<>
|
|
|
|
inline float c_absdiff<float>(float a, float b)
|
|
|
|
{ return std::abs(a - b); }
|
|
|
|
template<>
|
|
|
|
inline double c_absdiff<double>(double a, double b)
|
|
|
|
{ return std::abs(a - b); }
|
|
|
|
|
|
|
|
/** multiply **/
|
|
|
|
template<typename T>
|
|
|
|
static inline T c_mul(T a, T b)
|
|
|
|
{ return saturate_cast<T>(a * b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_mul<uchar>(uchar a, uchar b)
|
|
|
|
{ return CV_FAST_CAST_8U(a * b); }
|
|
|
|
// scale
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline T1 c_mul(T1 a, T1 b, T2 scalar)
|
|
|
|
{ return saturate_cast<T1>(scalar * (T2)a * b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_mul<uchar, float>(uchar a, uchar b, float scalar)
|
|
|
|
{ return saturate_cast<uchar>(scalar * CV_8TO32F(a) * CV_8TO32F(b)); }
|
|
|
|
|
|
|
|
/** divide & reciprocal **/
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline T2 c_div(T1 a, T2 b)
|
|
|
|
{ return saturate_cast<T2>(a / b); }
|
|
|
|
// recip
|
|
|
|
template<>
|
|
|
|
inline uchar c_div<float, uchar>(float a, uchar b)
|
|
|
|
{ return saturate_cast<uchar>(a / CV_8TO32F(b)); }
|
|
|
|
// scale
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline T1 c_div(T1 a, T1 b, T2 scalar)
|
|
|
|
{ return saturate_cast<T1>(scalar * (T2)a / b); }
|
|
|
|
template<>
|
|
|
|
inline uchar c_div<uchar, float>(uchar a, uchar b, float scalar)
|
|
|
|
{ return saturate_cast<uchar>(scalar * CV_8TO32F(a) / CV_8TO32F(b)); }
|
|
|
|
|
|
|
|
//=======================================
|
|
|
|
// Arithmetic and logical operations
|
|
|
|
// +, -, *, /, &, |, ^, ~, abs ...
|
|
|
|
//=======================================
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
// Add
|
|
|
|
// Addition functor with one overload per operand kind:
//  - scalar T1 operands are forwarded to c_add;
//  - vector Tvec operands are forwarded to v_add.
template<typename T1, typename Tvec>
struct op_add
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return c_add(a, b);
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_add(a, b);
    }
};
|
|
|
|
|
|
|
|
// Subtract
|
|
|
|
// Subtraction functor with one overload per operand kind:
//  - scalar T1 operands are forwarded to c_sub;
//  - vector Tvec operands are forwarded to v_sub.
template<typename T1, typename Tvec>
struct op_sub
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return c_sub(a, b);
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_sub(a, b);
    }
};
|
|
|
|
|
|
|
|
// Max & Min
|
|
|
|
// Maximum functor with one overload per operand kind:
//  - scalar T1 operands are forwarded to c_max;
//  - vector Tvec operands are forwarded to v_max.
template<typename T1, typename Tvec>
struct op_max
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return c_max(a, b);
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_max(a, b);
    }
};
|
|
|
|
|
|
|
|
// Minimum functor with one overload per operand kind:
//  - scalar T1 operands are forwarded to c_min;
//  - vector Tvec operands are forwarded to v_min.
template<typename T1, typename Tvec>
struct op_min
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return c_min(a, b);
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_min(a, b);
    }
};
|
|
|
|
|
|
|
|
// Absolute difference
|
|
|
|
// Absolute-difference functor (primary template):
//  - scalar T1 operands are forwarded to c_absdiff;
//  - vector Tvec operands are forwarded to v_absdiff.
template<typename T1, typename Tvec>
struct op_absdiff
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return c_absdiff(a, b);
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_absdiff(a, b);
    }
};
|
|
|
|
// Signed absolute difference, 's'
|
|
|
|
template<>
|
|
|
|
struct op_absdiff<schar, v_int8>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_int8 r(const v_int8& a, const v_int8& b)
|
|
|
|
{ return v_absdiffs(a, b); }
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline schar r(schar a, schar b)
|
|
|
|
{ return c_absdiff(a, b); }
|
|
|
|
};
|
|
|
|
template<>
|
|
|
|
struct op_absdiff<short, v_int16>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_int16 r(const v_int16& a, const v_int16& b)
|
|
|
|
{ return v_absdiffs(a, b); }
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline short r(short a, short b)
|
|
|
|
{ return c_absdiff(a, b); }
|
|
|
|
};
|
|
|
|
template<>
|
|
|
|
struct op_absdiff<int, v_int32>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_int32 r(const v_int32& a, const v_int32& b)
|
|
|
|
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline int r(int a, int b)
|
|
|
|
{ return c_absdiff(a, b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
// Logical
|
|
|
|
// Bitwise-OR functor with one overload per operand kind:
//  - scalar T1 operands use the built-in | operator;
//  - vector Tvec operands are forwarded to v_or.
template<typename T1, typename Tvec>
struct op_or
{
    // Scalar path.
    static inline T1 r(T1 a, T1 b)
    {
        return a | b;
    }

    // Vector path.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_or(a, b);
    }
};
|
|
|
|
/**
 * Bitwise XOR operation functor.
 *
 * Exposes two static overloads named `r`, one per operand kind, so that the
 * same functor type can serve both the vector and the scalar code path.
 *
 * @tparam T1   scalar element type; the scalar overload applies `^` to it.
 * @tparam Tvec vector register type; the vector overload forwards to v_xor().
 */
template<typename T1, typename Tvec>
struct op_xor
{
    /** Vector overload: returns v_xor(a, b). */
    static inline Tvec r(const Tvec& a, const Tvec& b)
    { return v_xor(a, b); }

    /** Scalar overload: returns a ^ b, converted back to T1. */
    static inline T1 r(T1 a, T1 b)
    { return a ^ b; }
};
|
|
|
|
/**
 * Bitwise AND operation functor.
 *
 * Exposes two static overloads named `r`, one per operand kind, so that the
 * same functor type can serve both the vector and the scalar code path.
 *
 * @tparam T1   scalar element type; the scalar overload applies `&` to it.
 * @tparam Tvec vector register type; the vector overload forwards to v_and().
 */
template<typename T1, typename Tvec>
struct op_and
{
    /** Vector overload: returns v_and(a, b). */
    static inline Tvec r(const Tvec& a, const Tvec& b)
    { return v_and(a, b); }

    /** Scalar overload: returns a & b, converted back to T1. */
    static inline T1 r(T1 a, T1 b)
    { return a & b; }
};
|
|
|
|
/**
 * Bitwise NOT operation functor.
 *
 * Unlike the binary functors, the vector overload takes a single operand;
 * the scalar overload keeps a two-argument signature and ignores the second
 * argument, so it can be called the same way as the binary functors.
 *
 * @tparam T1   scalar element type; the scalar overload applies `~` to it.
 * @tparam Tvec vector register type; the vector overload forwards to v_not().
 */
template<typename T1, typename Tvec>
struct op_not
{
    /** Vector overload: returns v_not(a). The second operand is dropped at the loader level. */
    static inline Tvec r(const Tvec& a)
    { return v_not(a); }

    /** Scalar overload: returns ~a converted back to T1; the unnamed second argument is unused. */
    static inline T1 r(T1 a, T1)
    { return ~a; }
};
|
|
|
|
|
|
|
|
//////////////////////////// Loaders /////////////////////////////////
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
so that the `CV_SIMD` blocks that are not yet enabled for `CV_SIMD_SCALABLE` can be found by searching for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remain unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; they are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
/**
 * Load / apply / store helper for a binary vector operation.
 *
 * Each member loads one vector from each source pointer, combines them with
 * OP<T1, Tvec>::r and stores the result to dst. The three members differ
 * only in which load/store intrinsics they use.
 *
 * @tparam OP   operation functor template providing a static r(Tvec, Tvec).
 * @tparam T1   scalar element type of the source and destination buffers.
 * @tparam Tvec vector register type produced by the loads.
 */
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
{
    typedef OP<T1, Tvec> op;

    /** Plain variant: vx_load() on both sources, v_store() to dst. */
    static inline void l(const T1* src1, const T1* src2, T1* dst)
    {
        Tvec a = vx_load(src1);
        Tvec b = vx_load(src2);
        v_store(dst, op::r(a, b));
    }

    /**
     * Aligned variant: vx_load_aligned() / v_store_aligned().
     * bin_loop only takes this path after is_aligned(src1, src2, dst) succeeded.
     */
    static inline void la(const T1* src1, const T1* src2, T1* dst)
    {
        Tvec a = vx_load_aligned(src1);
        Tvec b = vx_load_aligned(src2);
        v_store_aligned(dst, op::r(a, b)); // todo: try write without cache
    }

    /**
     * Low-half variant: vx_load_low() / v_store_low().
     * bin_loop uses it for a leftover of 8/sizeof(T1) elements when CV_SIMD_WIDTH == 16.
     */
    static inline void l64(const T1* src1, const T1* src2, T1* dst)
    {
        Tvec a = vx_load_low(src1), b = vx_load_low(src2);
        v_store_low(dst, op::r(a, b));
    }
};
|
|
|
|
|
|
|
|
// void src2 for operation "not"
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct bin_loader<op_not, T1, Tvec>
|
|
|
|
{
|
|
|
|
typedef op_not<T1, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1*, T1* dst)
|
|
|
|
{
|
|
|
|
Tvec a = vx_load(src1);
|
|
|
|
v_store(dst, op::r(a));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void la(const T1* src1, const T1*, T1* dst)
|
|
|
|
{
|
|
|
|
Tvec a = vx_load_aligned(src1);
|
|
|
|
v_store_aligned(dst, op::r(a));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void l64(const T1* src1, const T1*, T1* dst)
|
|
|
|
{
|
|
|
|
Tvec a = vx_load_low(src1);
|
|
|
|
v_store_low(dst, op::r(a));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename T2>
|
|
|
|
static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst)
|
|
|
|
{ return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; }
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
typedef bin_loader<OP, T1, Tvec> ldr;
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step = VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
#if !CV_NEON && CV_SIMD_WIDTH == 16
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step_l = wide_step * 2;
|
2018-07-26 04:00:37 +08:00
|
|
|
#else
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step_l = wide_step;
|
2018-07-26 04:00:37 +08:00
|
|
|
#endif
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
step1 /= sizeof(T1);
|
|
|
|
step2 /= sizeof(T1);
|
|
|
|
step /= sizeof(T1);
|
|
|
|
|
|
|
|
for (; height--; src1 += step1, src2 += step2, dst += step)
|
|
|
|
{
|
|
|
|
int x = 0;
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2019-09-21 00:52:48 +08:00
|
|
|
#if !CV_NEON && !CV_MSA
|
2018-07-26 04:00:37 +08:00
|
|
|
if (is_aligned(src1, src2, dst))
|
|
|
|
{
|
|
|
|
for (; x <= width - wide_step_l; x += wide_step_l)
|
|
|
|
{
|
|
|
|
ldr::la(src1 + x, src2 + x, dst + x);
|
2019-09-21 00:52:48 +08:00
|
|
|
#if CV_SIMD_WIDTH == 16
|
2018-07-26 04:00:37 +08:00
|
|
|
ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
for (; x <= width - wide_step_l; x += wide_step_l)
|
|
|
|
{
|
|
|
|
ldr::l(src1 + x, src2 + x, dst + x);
|
|
|
|
#if !CV_NEON && CV_SIMD_WIDTH == 16
|
|
|
|
ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
#if CV_SIMD_WIDTH == 16
|
|
|
|
for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1))
|
|
|
|
{
|
|
|
|
ldr::l64(src1 + x, src2 + x, dst + x);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
#if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
|
|
|
|
for (; x <= width - 4; x += 4)
|
|
|
|
{
|
|
|
|
T1 t0 = op::r(src1[x], src2[x]);
|
|
|
|
T1 t1 = op::r(src1[x + 1], src2[x + 1]);
|
|
|
|
dst[x] = t0; dst[x + 1] = t1;
|
|
|
|
|
|
|
|
t0 = op::r(src1[x + 2], src2[x + 2]);
|
|
|
|
t1 = op::r(src1[x + 3], src2[x + 3]);
|
|
|
|
dst[x + 2] = t0; dst[x + 3] = t1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (; x < width; x++)
|
|
|
|
dst[x] = op::r(src1[x], src2[x]);
|
|
|
|
}
|
|
|
|
|
|
|
|
vx_cleanup();
|
|
|
|
}
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
// Scalar-only binary loop, compiled when neither CV_SIMD_64F nor
// CV_SIMD_SCALABLE_64F is enabled. It applies the scalar overload OP::r()
// to every pair of elements of two 2D buffers and writes the result to dst.
//
//   OP            - operation policy; only its scalar r(T1, T1) is used here
//   T1            - element type of both sources and of the destination
//   Tvec          - forwarded to OP only; no vector value is created here
//   src1, src2    - first row of each source
//   dst           - first row of the destination
//   step1, step2,
//   step          - row strides; each is divided by sizeof(T1) before being
//                   used as a pointer increment (presumably given in bytes)
//   width, height - elements per row / number of rows
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
    typedef OP<T1, Tvec/*dummy*/> op;

    const size_t stride1 = step1 / sizeof(T1);
    const size_t stride2 = step2 / sizeof(T1);
    const size_t stride_dst = step / sizeof(T1);

    while (height--)
    {
        int x = 0;

        // Four elements per iteration, handled as two pairs. Each pair is
        // fully read before it is written, exactly like the original order.
        while (x <= width - 4)
        {
            T1 r0 = op::r(src1[x], src2[x]);
            T1 r1 = op::r(src1[x + 1], src2[x + 1]);
            dst[x] = r0;
            dst[x + 1] = r1;

            r0 = op::r(src1[x + 2], src2[x + 2]);
            r1 = op::r(src1[x + 3], src2[x + 3]);
            dst[x + 2] = r0;
            dst[x + 3] = r1;

            x += 4;
        }

        // Remaining elements of the row.
        while (x < width)
        {
            dst[x] = op::r(src1[x], src2[x]);
            ++x;
        }

        src1 += stride1;
        src2 += stride2;
        dst += stride_dst;
    }
}
// Without 64-bit float SIMD the 64F loops use the scalar implementation ...
#define BIN_LOOP64F bin_loop_nosimd
#else
// ... otherwise they use the regular bin_loop.
#define BIN_LOOP64F bin_loop
#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#ifndef SIMD_GUARD
// Parameter list shared by the binary kernels declared in this file:
// two source pointers with their steps, a destination pointer with its step,
// and the width/height of the processed area.
#define BIN_ARGS(_Tp)                     \
    const _Tp* src1, size_t step1,        \
    const _Tp* src2, size_t step2,        \
    _Tp* dst, size_t step,                \
    int width, int height

// The same names, in the same order, for forwarding a BIN_ARGS parameter list.
#define BIN_ARGS_PASS \
    src1, step1, src2, step2, dst, step, width, height
#endif // SIMD_GUARD
|
|
|
|
|
|
|
|
// Expands to the prototype `void <kernel>(BIN_ARGS(<_Tp>));`
// (the trailing semicolon is part of the expansion).
#undef DECLARE_SIMD_FUN
#define DECLARE_SIMD_FUN(kernel, _Tp) \
    void kernel(BIN_ARGS(_Tp));
|
|
|
|
|
|
|
|
// Expands to the dispatching entry point `void <kernel>(BIN_ARGS(<_Tp>), void*)`.
// The body invokes, in this order: CALL_HAL with cv_hal_<kernel>,
// ARITHM_CALL_IPP with arithm_ipp_<kernel>, then CV_CPU_DISPATCH for <kernel>.
// The vector-type and operation arguments are accepted but not used here.
#undef DISPATCH_SIMD_FUN
#define DISPATCH_SIMD_FUN(kernel, _Tp, _Tv, _Op)                             \
    void kernel(BIN_ARGS(_Tp), void*)                                        \
    {                                                                        \
        CV_INSTRUMENT_REGION();                                              \
        CALL_HAL(kernel, __CV_CAT(cv_hal_, kernel), BIN_ARGS_PASS)           \
        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, kernel), BIN_ARGS_PASS)        \
        CV_CPU_DISPATCH(kernel, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \
    }
|
|
|
|
|
|
|
|
// Expands to the definition of `void <kernel>(BIN_ARGS(<_Tp>))`, whose body
// forwards all arguments to bin_loop<_Op, _Tp, _Tv>.
#undef DEFINE_SIMD_FUN
#define DEFINE_SIMD_FUN(kernel, _Tp, _Tv, _Op)      \
    void kernel(BIN_ARGS(_Tp))                      \
    {                                               \
        CV_INSTRUMENT_REGION();                     \
        bin_loop<_Op, _Tp, _Tv>(BIN_ARGS_PASS);     \
    }
|
|
|
|
|
|
|
|
// Expands to the definition of `void <kernel>(BIN_ARGS(<_Tp>))`, whose body
// forwards all arguments to bin_loop_nosimd. v_float64 is passed as the
// vector-type template argument.
#undef DEFINE_NOSIMD_FUN
#define DEFINE_NOSIMD_FUN(kernel, _Tp, _Op)                       \
    void kernel(BIN_ARGS(_Tp))                                    \
    {                                                             \
        CV_INSTRUMENT_REGION();                                   \
        bin_loop_nosimd<_Op, _Tp, v_float64>(BIN_ARGS_PASS);      \
    }
|
|
|
|
|
|
|
|
// Instantiate the binary kernels. DEFINE_SIMD_ALL and DEFINE_SIMD_U8 are
// defined outside this section; NOTE(review): presumably the former covers
// every supported element type and the latter only 8-bit unsigned - confirm.
DEFINE_SIMD_ALL(add, op_add)
DEFINE_SIMD_ALL(sub, op_sub)

DEFINE_SIMD_ALL(min, op_min)
DEFINE_SIMD_ALL(max, op_max)

DEFINE_SIMD_ALL(absdiff, op_absdiff)

DEFINE_SIMD_U8(or, op_or)
DEFINE_SIMD_U8(xor, op_xor)
DEFINE_SIMD_U8(and, op_and)
|
|
|
|
|
|
|
|
// The "not" operation is an exception: it has only one source.
// The macros could be used here as well, but it is written out by hand
// to make clearer how the "DEFINE_SIMD_*" macros work.
|
|
|
|
|
|
|
|
#if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY)
// Prototype of the 8-bit "not" kernel. It is needed both when only
// declarations are requested and ahead of the definition.
void not8u(const uchar* src1, size_t step1,
           const uchar* src2, size_t step2,
           uchar* dst, size_t step,
           int width, int height);
#endif
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
// Definition of the 8-bit "not" kernel: forwards every argument unchanged to
// bin_loop instantiated with op_not, uchar and v_uint8.
void not8u(const uchar* src1, size_t step1,
           const uchar* src2, size_t step2,
           uchar* dst, size_t step,
           int width, int height)
{
    CV_INSTRUMENT_REGION();

    bin_loop<op_not, uchar, v_uint8>(
        src1, step1,
        src2, step2,
        dst, step,
        width, height);
}
#endif
|
|
|
|
#ifdef ARITHM_DISPATCHING_ONLY
// Dispatching entry point of the 8-bit "not" kernel (extra unnamed void*
// argument). CALL_HAL and ARITHM_CALL_IPP are invoked first and receive only
// the first source (src1/step1) and the destination; CV_CPU_DISPATCH then
// forwards the full argument list, including src2/step2, to not8u.
void not8u(const uchar* src1, size_t step1,
           const uchar* src2, size_t step2,
           uchar* dst, size_t step,
           int width, int height, void*)
{
    CV_INSTRUMENT_REGION();

    CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height)
    ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height)

    CV_CPU_DISPATCH(not8u,
                    (src1, step1, src2, step2, dst, step, width, height),
                    CV_CPU_DISPATCH_MODES_ALL);
}
#endif
|
|
|
|
|
|
|
|
//=======================================
|
|
|
|
// Compare
|
|
|
|
//=======================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct op_cmplt
|
|
|
|
{
|
|
|
|
static inline Tvec r(const Tvec& a, const Tvec& b)
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
{ return v_lt(a, b); }
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline uchar r(T1 a, T1 b)
|
|
|
|
{ return (uchar)-(int)(a < b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct op_cmple
|
|
|
|
{
|
|
|
|
static inline Tvec r(const Tvec& a, const Tvec& b)
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
{ return v_le(a, b); }
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline uchar r(T1 a, T1 b)
|
|
|
|
{ return (uchar)-(int)(a <= b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct op_cmpeq
|
|
|
|
{
|
|
|
|
static inline Tvec r(const Tvec& a, const Tvec& b)
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
{ return v_eq(a, b); }
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline uchar r(T1 a, T1 b)
|
|
|
|
{ return (uchar)-(int)(a == b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct op_cmpne
|
|
|
|
{
|
|
|
|
static inline Tvec r(const Tvec& a, const Tvec& b)
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
{ return v_ne(a, b); }
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline uchar r(T1 a, T1 b)
|
|
|
|
{ return (uchar)-(int)(a != b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
//////////////////////////// Loaders /////////////////////////////////
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
// todo: add support for RW alignment & stream
|
|
|
|
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
struct cmp_loader_n
|
|
|
|
{
|
|
|
|
void l(const T1* src1, const T1* src2, uchar* dst);
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
struct cmp_loader_n<sizeof(uchar), OP, T1, Tvec>
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
|
|
|
{
|
|
|
|
Tvec a = vx_load(src1);
|
|
|
|
Tvec b = vx_load(src2);
|
|
|
|
v_store(dst, v_reinterpret_as_u8(op::r(a, b)));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
Tvec c0 = op::r(vx_load(src1), vx_load(src2));
|
|
|
|
Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
|
|
|
|
v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
|
|
|
|
v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
|
|
|
|
v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
|
|
|
|
v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3)));
|
|
|
|
v_store(dst, v_pack_b(c0, c1, c2, c3));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
|
|
|
|
v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
|
|
|
|
v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
|
|
|
|
v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3)));
|
|
|
|
|
|
|
|
v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4)));
|
|
|
|
v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5)));
|
|
|
|
v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6)));
|
|
|
|
v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7)));
|
|
|
|
v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
|
|
|
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
|
|
|
|
{
|
|
|
|
typedef OP<T1, Tvec> op;
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
|
2018-07-26 04:00:37 +08:00
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
step1 /= sizeof(T1);
|
|
|
|
step2 /= sizeof(T1);
|
|
|
|
|
|
|
|
for (; height--; src1 += step1, src2 += step2, dst += step)
|
|
|
|
{
|
|
|
|
int x = 0;
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
for (; x <= width - wide_step; x += wide_step)
|
|
|
|
{
|
|
|
|
ldr::l(src1 + x, src2 + x, dst + x);
|
|
|
|
}
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
#if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
|
|
|
|
for (; x <= width - 4; x += 4)
|
|
|
|
{
|
|
|
|
uchar t0 = op::r(src1[x], src2[x]);
|
|
|
|
uchar t1 = op::r(src1[x + 1], src2[x + 1]);
|
|
|
|
dst[x] = t0; dst[x + 1] = t1;
|
|
|
|
|
|
|
|
t0 = op::r(src1[x + 2], src2[x + 2]);
|
|
|
|
t1 = op::r(src1[x + 3], src2[x + 3]);
|
|
|
|
dst[x + 2] = t0; dst[x + 3] = t1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (; x < width; x++)
|
|
|
|
dst[x] = op::r(src1[x], src2[x]);
|
|
|
|
}
|
|
|
|
|
|
|
|
vx_cleanup();
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
uchar* dst, size_t step, int width, int height, int cmpop)
|
|
|
|
{
|
|
|
|
switch(cmpop)
|
|
|
|
{
|
|
|
|
case CMP_LT:
|
|
|
|
cmp_loop<op_cmplt, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_GT:
|
|
|
|
cmp_loop<op_cmplt, T1, Tvec>(src2, step2, src1, step1, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_LE:
|
|
|
|
cmp_loop<op_cmple, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_GE:
|
|
|
|
cmp_loop<op_cmple, T1, Tvec>(src2, step2, src1, step1, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_EQ:
|
|
|
|
cmp_loop<op_cmpeq, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
CV_Assert(cmpop == CMP_NE);
|
|
|
|
cmp_loop<op_cmpne, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
template< template<typename T1, typename Tvec> class OP, typename T1>
|
|
|
|
static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
|
|
|
|
{
|
|
|
|
typedef OP<T1, v_int32 /*dummy*/> op;
|
|
|
|
|
|
|
|
step1 /= sizeof(T1);
|
|
|
|
step2 /= sizeof(T1);
|
|
|
|
|
|
|
|
for (; height--; src1 += step1, src2 += step2, dst += step)
|
|
|
|
{
|
|
|
|
int x = 0;
|
|
|
|
|
|
|
|
for (; x <= width - 4; x += 4)
|
|
|
|
{
|
|
|
|
uchar t0 = op::r(src1[x], src2[x]);
|
|
|
|
uchar t1 = op::r(src1[x + 1], src2[x + 1]);
|
|
|
|
dst[x] = t0; dst[x + 1] = t1;
|
|
|
|
|
|
|
|
t0 = op::r(src1[x + 2], src2[x + 2]);
|
|
|
|
t1 = op::r(src1[x + 3], src2[x + 3]);
|
|
|
|
dst[x + 2] = t0; dst[x + 3] = t1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (; x < width; x++)
|
|
|
|
dst[x] = op::r(src1[x], src2[x]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2,
|
|
|
|
uchar* dst, size_t step, int width, int height, int cmpop)
|
|
|
|
{
|
|
|
|
switch(cmpop)
|
|
|
|
{
|
|
|
|
case CMP_LT:
|
|
|
|
cmp_loop_nosimd<op_cmplt, double>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_GT:
|
|
|
|
cmp_loop_nosimd<op_cmplt, double>(src2, step2, src1, step1, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_LE:
|
|
|
|
cmp_loop_nosimd<op_cmple, double>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_GE:
|
|
|
|
cmp_loop_nosimd<op_cmple, double>(src2, step2, src1, step1, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
case CMP_EQ:
|
|
|
|
cmp_loop_nosimd<op_cmpeq, double>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
CV_Assert(cmpop == CMP_NE);
|
|
|
|
cmp_loop_nosimd<op_cmpne, double>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#ifndef SIMD_GUARD
// Parameter list shared by the comparison entry points for element type _T1:
// two sources with their steps, the uchar destination with its step, and the
// width/height of the processed region.
#define CMP_ARGS(_T1)                \
    const _T1* src1, size_t step1,   \
    const _T1* src2, size_t step2,   \
    uchar* dst, size_t step,         \
    int width, int height

// Argument list that forwards the CMP_ARGS parameters unchanged.
#define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
#endif // SIMD_GUARD
|
|
|
|
|
|
|
|
// Declaration of a comparison entry point `fun` for element type _T1;
// the comparison code is received by value as an int.
#undef DECLARE_SIMD_FUN
#define DECLARE_SIMD_FUN(fun, _T1) \
    void fun(CMP_ARGS(_T1), int cmpop);
|
|
|
|
|
|
|
|
// Definition of the dispatching entry point `fun` for element type _T1.
// The comparison code is received through a void* and read as an int.
// The body expands, in this order, CALL_HAL, ARITHM_CALL_IPP and
// CV_CPU_DISPATCH, each receiving the forwarded arguments plus that int.
// _Tvec and the variadic tail are accepted but not used in the expansion.
#undef DISPATCH_SIMD_FUN
#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...)                                                       \
    void fun(CMP_ARGS(_T1), void* _cmpop)                                                             \
    {                                                                                                 \
        CV_INSTRUMENT_REGION();                                                                       \
        CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *static_cast<int*>(_cmpop))              \
        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *static_cast<int*>(_cmpop))        \
        CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *static_cast<int*>(_cmpop)), CV_CPU_DISPATCH_MODES_ALL); \
    }
|
|
|
|
|
|
|
|
// Definition of the vectorized entry point `fun` for element type _T1:
// forwards every argument and cmpop to cmp_loop<_T1, _Tvec>.
// The variadic tail is accepted but not used in the expansion.
#undef DEFINE_SIMD_FUN
#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...)           \
    void fun(CMP_ARGS(_T1), int cmpop)                  \
    {                                                   \
        CV_INSTRUMENT_REGION();                         \
        cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop);     \
    }
|
|
|
|
|
|
|
|
// Definition of the scalar-only entry point `fun` for element type _T1:
// forwards every argument and cmpop to cmp_loop_nosimd.
// _Tvec and the variadic tail are accepted but not used in the expansion.
#undef DEFINE_NOSIMD_FUN
#define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...)     \
    void fun(CMP_ARGS(_T1), int cmpop)              \
    {                                               \
        CV_INSTRUMENT_REGION();                     \
        cmp_loop_nosimd(CMP_ARGS_PASS, cmpop);      \
    }
|
|
|
|
|
|
|
|
// todo: try to avoid defining dispatcher functions via macros in cases like this
|
Fix compilation on some 32-bit windows
I do not have more info on the platform as it is internal.
Without this fix, the error is:
core/src/arithm.simd.hpp:868:1: error: too few arguments provided to function-like macro invocation
868 | DEFINE_SIMD_ALL(cmp)
| ^
./third_party/OpenCV/public/modules/./core/src/arithm.simd.hpp:93:5: note: expanded from macro 'DEFINE_SIMD_ALL'
93 | DEFINE_SIMD_NSAT(fun, __VA_ARGS__)
| ^
./third_party/OpenCV/public/modules/./core/src/arithm.simd.hpp:89:5: note: expanded from macro 'DEFINE_SIMD_NSAT'
89 | DEFINE_SIMD_F64(fun, __VA_ARGS__)
| ^
./third_party/OpenCV/public/modules/./core/src/arithm.simd.hpp:77:9: note: expanded from macro 'DEFINE_SIMD_F64'
77 | DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__)
| ^
./third_party/OpenCV/public/modules/./core/src/arithm.simd.hpp:47:56: note: expanded from macro 'DEFINE_NOSIMD'
47 | DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__)
| ^
./third_party/OpenCV/public/modules/./core/src/arithm.simd.hpp:860:9: note: macro 'DEFINE_NOSIMD_FUN' defined here
860 | #define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \
2023-11-29 23:27:11 +08:00
|
|
|
DEFINE_SIMD_ALL(cmp, void)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
//=========================================================================
|
|
|
|
// scaling helpers for single and dual source
|
|
|
|
//
|
|
|
|
// Dual: Multiply, Div, AddWeighted
|
|
|
|
//
|
|
|
|
// Single: Reciprocal
|
|
|
|
//
|
|
|
|
//=========================================================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//////////////////////////// Loaders ///////////////////////////////
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
// todo: add support for RW alignment & stream
|
|
|
|
// Primary template of the scalar-parameterised loaders. It only declares the
// two l() overloads; the partial specializations keyed on nload supply the
// implementations.
template<int nload, template<typename, typename, typename> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
{
    // dual source
    void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst);

    // single source
    void l(const T1* src1, const T2* scalar, T1* dst);
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
|
|
|
|
struct scalar_loader_n<sizeof(uchar), OP, T1, T2, Tvec>
|
|
|
|
{
|
|
|
|
typedef OP<T1, T2, v_int16> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst)
|
|
|
|
{
|
|
|
|
v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1));
|
|
|
|
v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2));
|
|
|
|
|
|
|
|
v_int32 t0, t1, t2, t3;
|
|
|
|
v_expand(v_src1, t0, t2);
|
|
|
|
v_expand(v_src2, t1, t3);
|
|
|
|
|
|
|
|
v_float32 f0, f1, f2, f3;
|
|
|
|
f0 = v_cvt_f32(t0);
|
|
|
|
f1 = v_cvt_f32(t1);
|
|
|
|
f2 = v_cvt_f32(t2);
|
|
|
|
f3 = v_cvt_f32(t3);
|
|
|
|
|
|
|
|
f0 = op::r(f0, f1, scalar);
|
|
|
|
f2 = op::r(f2, f3, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f2);
|
|
|
|
|
|
|
|
store(dst, v_src2, r0, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T2* scalar, T1* dst)
|
|
|
|
{
|
|
|
|
v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1));
|
|
|
|
|
|
|
|
v_int32 t0, t1;
|
|
|
|
v_expand(v_src1, t0, t1);
|
|
|
|
|
|
|
|
v_float32 f0, f1;
|
|
|
|
f0 = v_cvt_f32(t0);
|
|
|
|
f1 = v_cvt_f32(t1);
|
|
|
|
|
|
|
|
f0 = op::r(f0, scalar);
|
|
|
|
f1 = op::r(f1, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f1);
|
|
|
|
|
|
|
|
store(dst, v_src1, r0, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b)
|
|
|
|
{
|
|
|
|
v_pack_u_store(dst, op::pre(src, v_pack(a, b)));
|
|
|
|
}
|
|
|
|
static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b)
|
|
|
|
{
|
|
|
|
v_pack_store(dst, op::pre(src, v_pack(a, b)));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
|
|
|
|
struct scalar_loader_n<sizeof(ushort), OP, T1, T2, Tvec>
|
|
|
|
{
|
|
|
|
typedef typename V_RegTraits<Tvec>::w_reg Twvec;
|
|
|
|
typedef OP<T1, T2, Tvec> op;
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst)
|
|
|
|
{
|
|
|
|
Tvec v_src1 = vx_load(src1);
|
|
|
|
Tvec v_src2 = vx_load(src2);
|
|
|
|
|
|
|
|
Twvec t0, t1, t2, t3;
|
|
|
|
v_expand(v_src1, t0, t2);
|
|
|
|
v_expand(v_src2, t1, t3);
|
|
|
|
|
|
|
|
v_float32 f0, f1, f2, f3;
|
|
|
|
f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
|
|
|
|
f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
|
|
|
|
f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
|
|
|
|
f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
|
|
|
|
|
|
|
|
f0 = op::r(f0, f1, scalar);
|
|
|
|
f2 = op::r(f2, f3, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f2);
|
|
|
|
|
|
|
|
store(dst, v_src2, r0, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void l(const T1* src1, const T2* scalar, T1* dst)
|
|
|
|
{
|
|
|
|
Tvec v_src1 = vx_load(src1);
|
|
|
|
|
|
|
|
Twvec t0, t1;
|
|
|
|
v_expand(v_src1, t0, t1);
|
|
|
|
|
|
|
|
v_float32 f0, f1;
|
|
|
|
f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
|
|
|
|
f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
|
|
|
|
|
|
|
|
f0 = op::r(f0, scalar);
|
|
|
|
f1 = op::r(f1, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f1);
|
|
|
|
|
|
|
|
store(dst, v_src1, r0, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b)
|
|
|
|
{
|
|
|
|
v_store(dst, op::pre(src, v_pack_u(a, b)));
|
|
|
|
}
|
|
|
|
static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b)
|
|
|
|
{
|
|
|
|
v_store(dst, op::pre(src, v_pack(a, b)));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T2>
|
|
|
|
struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
|
|
|
|
{
|
|
|
|
typedef OP<int, T2, v_int32> op;
|
|
|
|
|
|
|
|
static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<v_int32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_int32 v_src1 = vx_load(src1);
|
|
|
|
v_int32 v_src2 = vx_load(src2);
|
|
|
|
v_int32 v_src1s = vx_load(src1 + step);
|
|
|
|
v_int32 v_src2s = vx_load(src2 + step);
|
|
|
|
|
|
|
|
v_float32 f0, f1, f2, f3;
|
|
|
|
f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
|
|
|
|
f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2));
|
|
|
|
f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s));
|
|
|
|
f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s));
|
|
|
|
|
|
|
|
f0 = op::r(f0, f1, scalar);
|
|
|
|
f2 = op::r(f2, f3, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f2);
|
|
|
|
|
|
|
|
r0 = op::pre(v_src2, r0);
|
|
|
|
r1 = op::pre(v_src2s, r1);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void l(const int* src1, const T2* scalar, int* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<v_int32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_int32 v_src1 = vx_load(src1);
|
|
|
|
v_int32 v_src1s = vx_load(src1 + step);
|
|
|
|
|
|
|
|
v_float32 f0, f1;
|
|
|
|
f0 = v_cvt_f32(v_src1);
|
|
|
|
f1 = v_cvt_f32(v_src1s);
|
|
|
|
|
|
|
|
f0 = op::r(f0, scalar);
|
|
|
|
f1 = op::r(f1, scalar);
|
|
|
|
|
|
|
|
v_int32 r0 = v_round(f0);
|
|
|
|
v_int32 r1 = v_round(f1);
|
|
|
|
|
|
|
|
r0 = op::pre(v_src1, r0);
|
|
|
|
r1 = op::pre(v_src1s, r1);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T2>
|
|
|
|
struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
|
|
|
|
{
|
|
|
|
typedef OP<float, T2, v_float32> op;
|
|
|
|
static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<v_float32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float32 v_src1 = vx_load(src1);
|
|
|
|
v_float32 v_src2 = vx_load(src2);
|
|
|
|
v_float32 v_src1s = vx_load(src1 + step);
|
|
|
|
v_float32 v_src2s = vx_load(src2 + step);
|
|
|
|
|
|
|
|
v_float32 r0 = op::r(v_src1, v_src2, scalar);
|
|
|
|
v_float32 r1 = op::r(v_src1s, v_src2s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void l(const float* src1, const T2* scalar, float* dst)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int step = VTraits<v_float32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float32 v_src1 = vx_load(src1);
|
|
|
|
v_float32 v_src1s = vx_load(src1 + step);
|
|
|
|
|
|
|
|
v_float32 r0 = op::r(v_src1, scalar);
|
|
|
|
v_float32 r1 = op::r(v_src1s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remain unmodified: Updated comments
- Some blocks will cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP>
|
|
|
|
struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
|
|
|
|
{
|
|
|
|
typedef OP<int, float, v_int32> op;
|
|
|
|
typedef OP<double, double, v_float64> op64;
|
|
|
|
|
|
|
|
static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_int32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_int32 v_src1 = vx_load(src1);
|
|
|
|
v_int32 v_src2 = vx_load(src2);
|
|
|
|
v_int32 v_src1s = vx_load(src1 + step);
|
|
|
|
v_int32 v_src2s = vx_load(src2 + step);
|
|
|
|
|
|
|
|
v_int32 r0 = r(v_src1, v_src2, scalar);
|
|
|
|
v_int32 r1 = r(v_src1s, v_src2s, scalar);
|
|
|
|
|
|
|
|
r0 = op::pre(v_src2, r0);
|
|
|
|
r1 = op::pre(v_src2s, r1);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
static inline void l(const int* src1, const double* scalar, int* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_int32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_int32 v_src1 = vx_load(src1);
|
|
|
|
v_int32 v_src1s = vx_load(src1 + step);
|
|
|
|
|
|
|
|
v_int32 r0 = r(v_src1, scalar);
|
|
|
|
v_int32 r1 = r(v_src1s, scalar);
|
|
|
|
|
|
|
|
r0 = op::pre(v_src1, r0);
|
|
|
|
r1 = op::pre(v_src1s, r1);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar)
|
|
|
|
{
|
|
|
|
v_float64 f0, f1, f2, f3;
|
|
|
|
f0 = v_cvt_f64(a);
|
|
|
|
f1 = v_cvt_f64_high(a);
|
|
|
|
f2 = v_cvt_f64(b);
|
|
|
|
f3 = v_cvt_f64_high(b);
|
|
|
|
|
|
|
|
v_float64 r0 = op64::r(f0, f2, scalar);
|
|
|
|
v_float64 r1 = op64::r(f1, f3, scalar);
|
|
|
|
|
|
|
|
return v_round(r0, r1);
|
|
|
|
}
|
|
|
|
static inline v_int32 r(const v_int32& a, const double* scalar)
|
|
|
|
{
|
|
|
|
v_float64 f0, f1;
|
|
|
|
f0 = v_cvt_f64(a);
|
|
|
|
f1 = v_cvt_f64_high(a);
|
|
|
|
|
|
|
|
v_float64 r0 = op64::r(f0, scalar);
|
|
|
|
v_float64 r1 = op64::r(f1, scalar);
|
|
|
|
|
|
|
|
return v_round(r0, r1);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP>
|
|
|
|
struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
|
|
|
|
{
|
|
|
|
typedef OP<float, float, v_float32> op;
|
|
|
|
typedef OP<double, double, v_float64> op64;
|
|
|
|
|
|
|
|
static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_float32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float32 v_src1 = vx_load(src1);
|
|
|
|
v_float32 v_src2 = vx_load(src2);
|
|
|
|
v_float32 v_src1s = vx_load(src1 + step);
|
|
|
|
v_float32 v_src2s = vx_load(src2 + step);
|
|
|
|
|
|
|
|
v_float32 r0 = r(v_src1, v_src2, scalar);
|
|
|
|
v_float32 r1 = r(v_src1s, v_src2s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
static inline void l(const float* src1, const double* scalar, float* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_float32>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float32 v_src1 = vx_load(src1);
|
|
|
|
v_float32 v_src1s = vx_load(src1 + step);
|
|
|
|
|
|
|
|
v_float32 r0 = r(v_src1, scalar);
|
|
|
|
v_float32 r1 = r(v_src1s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar)
|
|
|
|
{
|
|
|
|
v_float64 f0, f1, f2, f3;
|
|
|
|
f0 = v_cvt_f64(a);
|
|
|
|
f1 = v_cvt_f64_high(a);
|
|
|
|
f2 = v_cvt_f64(b);
|
|
|
|
f3 = v_cvt_f64_high(b);
|
|
|
|
|
|
|
|
v_float64 r0 = op64::r(f0, f2, scalar);
|
|
|
|
v_float64 r1 = op64::r(f1, f3, scalar);
|
|
|
|
|
|
|
|
return v_cvt_f32(r0, r1);
|
|
|
|
}
|
|
|
|
// One-input operation on a float vector, evaluated through the 64-bit op.
// The input is converted with v_cvt_f64 / v_cvt_f64_high, op64::r is applied to
// each converted part, and the two double results are packed back with v_cvt_f32.
static inline v_float32 r(const v_float32& a, const double* scalar)
{
    v_float64 a_lo = v_cvt_f64(a);
    v_float64 a_hi = v_cvt_f64_high(a);

    v_float64 res_lo = op64::r(a_lo, scalar);
    v_float64 res_hi = op64::r(a_hi, scalar);

    return v_cvt_f32(res_lo, res_hi);
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP>
|
|
|
|
struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
|
|
|
|
{
|
|
|
|
typedef OP<double, double, v_float64> op;
|
|
|
|
|
|
|
|
static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_float64>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float64 v_src1 = vx_load(src1);
|
|
|
|
v_float64 v_src2 = vx_load(src2);
|
|
|
|
v_float64 v_src1s = vx_load(src1 + step);
|
|
|
|
v_float64 v_src2s = vx_load(src2 + step);
|
|
|
|
|
|
|
|
v_float64 r0 = op::r(v_src1, v_src2, scalar);
|
|
|
|
v_float64 r1 = op::r(v_src1s, v_src2s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
static inline void l(const double* src1, const double* scalar, double* dst)
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
const int step = VTraits<v_float64>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
v_float64 v_src1 = vx_load(src1);
|
|
|
|
v_float64 v_src1s = vx_load(src1 + step);
|
|
|
|
|
|
|
|
v_float64 r0 = op::r(v_src1, scalar);
|
|
|
|
v_float64 r1 = op::r(v_src1s, scalar);
|
|
|
|
|
|
|
|
v_store(dst, r0);
|
|
|
|
v_store(dst + step, r1);
|
|
|
|
}
|
|
|
|
};
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test failures when enabled for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
// dual source
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
|
|
|
|
static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const T2* scalar)
|
|
|
|
{
|
|
|
|
typedef OP<T1, T2, Tvec> op;
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
|
|
|
|
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
step1 /= sizeof(T1);
|
|
|
|
step2 /= sizeof(T1);
|
|
|
|
step /= sizeof(T1);
|
|
|
|
|
|
|
|
for (; height--; src1 += step1, src2 += step2, dst += step)
|
|
|
|
{
|
|
|
|
int x = 0;
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
for (; x <= width - wide_step; x += wide_step)
|
|
|
|
{
|
|
|
|
ldr::l(src1 + x, src2 + x, scalar, dst + x);
|
|
|
|
}
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
#if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
|
|
|
|
for (; x <= width - 4; x += 4)
|
|
|
|
{
|
|
|
|
T1 t0 = op::r(src1[x], src2[x], scalar);
|
|
|
|
T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar);
|
|
|
|
dst[x] = t0; dst[x + 1] = t1;
|
|
|
|
|
|
|
|
t0 = op::r(src1[x + 2], src2[x + 2], scalar);
|
|
|
|
t1 = op::r(src1[x + 3], src2[x + 3], scalar);
|
|
|
|
dst[x + 2] = t0; dst[x + 3] = t1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (; x < width; ++x)
|
|
|
|
dst[x] = op::r(src1[x], src2[x], scalar);
|
|
|
|
}
|
|
|
|
|
|
|
|
vx_cleanup();
|
|
|
|
}
|
|
|
|
|
|
|
|
// single source
|
|
|
|
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
|
|
|
|
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
|
|
|
|
{
|
|
|
|
typedef OP<T1, T2, Tvec> op;
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
|
|
|
|
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
|
2018-07-26 04:00:37 +08:00
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
step1 /= sizeof(T1);
|
|
|
|
step /= sizeof(T1);
|
|
|
|
|
|
|
|
for (; height--; src1 += step1, dst += step)
|
|
|
|
{
|
|
|
|
int x = 0;
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
for (; x <= width - wide_step; x += wide_step)
|
|
|
|
{
|
|
|
|
ldr::l(src1 + x, scalar, dst + x);
|
|
|
|
}
|
|
|
|
#endif // CV_SIMD
|
|
|
|
|
|
|
|
#if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16
|
|
|
|
for (; x <= width - 4; x += 4)
|
|
|
|
{
|
|
|
|
T1 t0 = op::r(src1[x], scalar);
|
|
|
|
T1 t1 = op::r(src1[x + 1], scalar);
|
|
|
|
dst[x] = t0; dst[x + 1] = t1;
|
|
|
|
|
|
|
|
t0 = op::r(src1[x + 2], scalar);
|
|
|
|
t1 = op::r(src1[x + 3], scalar);
|
|
|
|
dst[x + 2] = t0; dst[x + 3] = t1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (; x < width; ++x)
|
|
|
|
dst[x] = op::r(src1[x], scalar);
|
|
|
|
}
|
|
|
|
|
|
|
|
vx_cleanup();
|
|
|
|
}
|
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test failures when enabled for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
// dual source
|
|
|
|
// Row-wise driver for the two-input form of OP with no vector path.
// step1, step2 and step are divided by sizeof(T1) to obtain element strides.
// Every row is processed by a 4-element unrolled pass followed by a one-element tail.
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
                               T1* dst, size_t step, int width, int height, const T2* scalar)
{
    typedef OP<T1, T2, Tvec> op;

    const size_t stride1 = step1 / sizeof(T1);
    const size_t stride2 = step2 / sizeof(T1);
    const size_t dstride = step / sizeof(T1);

    while (height--)
    {
        int x = 0;

        // unrolled pass: four elements per iteration, evaluated and stored in pairs
        for (; x <= width - 4; x += 4)
        {
            const T1 v0 = op::r(src1[x], src2[x], scalar);
            const T1 v1 = op::r(src1[x + 1], src2[x + 1], scalar);
            dst[x] = v0;
            dst[x + 1] = v1;

            const T1 v2 = op::r(src1[x + 2], src2[x + 2], scalar);
            const T1 v3 = op::r(src1[x + 3], src2[x + 3], scalar);
            dst[x + 2] = v2;
            dst[x + 3] = v3;
        }

        // remaining elements of the row
        for (; x < width; ++x)
            dst[x] = op::r(src1[x], src2[x], scalar);

        // advance to the next row
        src1 += stride1;
        src2 += stride2;
        dst += dstride;
    }
}
|
|
|
|
|
|
|
|
// single source
|
|
|
|
// Row-wise driver for the one-input form of OP with no vector path.
// step1 and step are divided by sizeof(T1) to obtain element strides.
// Every row is processed by a 4-element unrolled pass followed by a one-element tail.
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
    typedef OP<T1, T2, Tvec> op;

    const size_t stride1 = step1 / sizeof(T1);
    const size_t dstride = step / sizeof(T1);

    while (height--)
    {
        int x = 0;

        // unrolled pass: four elements per iteration, evaluated and stored in pairs
        for (; x <= width - 4; x += 4)
        {
            const T1 v0 = op::r(src1[x], scalar);
            const T1 v1 = op::r(src1[x + 1], scalar);
            dst[x] = v0;
            dst[x + 1] = v1;

            const T1 v2 = op::r(src1[x + 2], scalar);
            const T1 v3 = op::r(src1[x + 3], scalar);
            dst[x + 2] = v2;
            dst[x + 3] = v3;
        }

        // remaining elements of the row
        for (; x < width; ++x)
            dst[x] = op::r(src1[x], scalar);

        // advance to the next row
        src1 += stride1;
        dst += dstride;
    }
}
|
|
|
|
|
|
|
|
#define SCALAR_LOOP64F scalar_loop_nosimd
|
|
|
|
#else
|
|
|
|
#define SCALAR_LOOP64F scalar_loop
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remain unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; these are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//=========================================================================
|
|
|
|
// Multiply
|
|
|
|
//=========================================================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
struct op_mul
|
|
|
|
{
|
|
|
|
static inline Tvec r(const Tvec& a, const Tvec& b)
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
{ return v_mul(a, b); }
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline T1 r(T1 a, T1 b)
|
|
|
|
{ return saturate_cast<T1>(a * b); }
|
|
|
|
};
|
|
|
|
|
|
|
|
// Product functor with an additional scale factor passed by pointer.
//   T1   - element type handled by the scalar overload
//   T2   - type of the scale factor
//   Tvec - vector type used by pre()
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
    // Scalar overload: delegates to c_mul with the dereferenced scale.
    static inline T1 r(T1 a, T1 b, const T2* scalar)
    {
        return c_mul(a, b, *scalar);
    }

#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Vector overload: broadcasts the scale into a v_float32 and multiplies
    // scale, a and b (in that order) with a single v_mul call.
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
    {
        const v_float32 scale_lanes = vx_setall_f32(*scalar);
        return v_mul(scale_lanes, a, b);
    }
#endif

    // Hook applied to a computed vector result; for multiplication it hands
    // `res` back unchanged and ignores its first argument.
    static inline Tvec pre(const Tvec&, const Tvec& res)
    {
        return res;
    }
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_mul_scale<double, double, v_float64>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
|
|
|
|
{
|
|
|
|
const v_float64 v_scalar = vx_setall_f64(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_mul(v_mul(v_scalar, a), b);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
static inline double r(double a, double b, const double* scalar)
|
|
|
|
{ return c_mul(a, b, *scalar); }
|
|
|
|
static inline v_float64 pre(const v_float64&, const v_float64& res)
|
|
|
|
{ return res; }
|
|
|
|
};
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
float fscalar = (float)*scalar;
|
|
|
|
if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
|
|
|
|
{
|
|
|
|
bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
scalar_loop<op_mul_scale, T1, float, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, &fscalar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
if (std::fabs(*scalar - 1.0) <= FLT_EPSILON)
|
|
|
|
{
|
|
|
|
bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_mul_scale, T1, double, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
void mul_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
|
|
|
|
double* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
if (*scalar == 1.0)
|
|
|
|
{
|
|
|
|
BIN_LOOP64F<op_mul, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_mul_scale, double, double, v_float64>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// SCALAR_ARGS(_T1): parameter list shared by the kernels generated below.
// It declares two source pointers and one destination pointer of element
// type _T1, each followed by its step, plus the width and height.
// Any previous definition is dropped first so the macro can be redefined.
#undef SCALAR_ARGS
|
|
|
|
#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
|
|
|
|
_T1* dst, size_t step, int width, int height
|
|
|
|
|
|
|
|
// SCALAR_ARGS_PASS: the argument names declared by SCALAR_ARGS, in the same
// order, for forwarding them to another call.
#undef SCALAR_ARGS_PASS
|
|
|
|
#define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
|
|
|
|
|
|
|
|
// DECLARE_SIMD_FUN(fun, _T1): emits the prototype of kernel `fun` taking the
// SCALAR_ARGS(_T1) parameters followed by a `const double*` scale.
// The trailing semicolon is part of the macro.
#undef DECLARE_SIMD_FUN
|
|
|
|
#define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar);
|
|
|
|
|
|
|
|
// DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...): emits the dispatching definition
// of `fun`. The scale arrives as `void*` and is cast to `const double*`:
// CALL_HAL and ARITHM_CALL_IPP receive the dereferenced value, while
// CV_CPU_DISPATCH receives the pointer. The macro invokes, in order,
// CALL_HAL (hook name cv_hal_<fun>), ARITHM_CALL_IPP (arithm_ipp_<fun>) and
// finally CV_CPU_DISPATCH over CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): those three macros are defined elsewhere; presumably the
// first two return early when they handle the call - confirm.
// _Tvec and the variadic arguments are not used by this expansion.
#undef DISPATCH_SIMD_FUN
|
|
|
|
#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
|
|
|
|
void fun(SCALAR_ARGS(_T1), void* scalar) \
|
|
|
|
{ \
|
|
|
|
CV_INSTRUMENT_REGION(); \
|
|
|
|
CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, *(const double*)scalar) \
|
|
|
|
ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, *(const double*)scalar) \
|
|
|
|
CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
|
|
|
|
CV_CPU_DISPATCH_MODES_ALL); \
|
|
|
|
}
|
|
|
|
|
|
|
|
// DEFINE_SIMD_FUN(fun, _T1, _Tvec, op): emits the definition of kernel `fun`.
// The body forwards all SCALAR_ARGS plus the scale pointer to the loop
// template `op`, instantiated as op<_T1, _Tvec>.
#undef DEFINE_SIMD_FUN
|
|
|
|
#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op) \
|
|
|
|
void fun(SCALAR_ARGS(_T1), const double* scalar) \
|
|
|
|
{ \
|
|
|
|
CV_INSTRUMENT_REGION(); \
|
|
|
|
op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar); \
|
|
|
|
}
|
|
|
|
|
|
|
|
// DEFINE_NOSIMD_FUN(fun, _T1, _OP): same as DEFINE_SIMD_FUN, but with the
// vector type fixed to v_float64 instead of being passed by the caller.
#undef DEFINE_NOSIMD_FUN
|
|
|
|
#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \
|
|
|
|
DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP)
|
|
|
|
|
|
|
|
// Generate the `mul` kernels. The SAT group is driven by mul_loop, which
// narrows the scale to float; the F32, S32 and F64 groups are driven by
// mul_loop_d, which keeps the scale in double precision.
// NOTE(review): the DEFINE_SIMD_* helpers are defined outside this section;
// presumably each expands to one kernel per element type - confirm there.
DEFINE_SIMD_SAT(mul, mul_loop)
|
|
|
|
DEFINE_SIMD_F32(mul, mul_loop_d)
|
|
|
|
DEFINE_SIMD_S32(mul, mul_loop_d)
|
|
|
|
DEFINE_SIMD_F64(mul, mul_loop_d)
|
|
|
|
|
|
|
|
//=========================================================================
|
|
|
|
// Div
|
|
|
|
//=========================================================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
// Plain quotient functor (no extra scale factor).
//   T1   - element type handled by the scalar overload
//   Tvec - vector type handled by the SIMD overload
template<typename T1, typename Tvec>
struct op_div_f
{
    // Scalar overload: returns a / b using the built-in operator; the
    // divisor is not checked here.
    static inline T1 r(T1 a, T1 b)
    {
        return a / b;
    }

    // Vector overload: forwards both operands to v_div.
    static inline Tvec r(const Tvec& a, const Tvec& b)
    {
        return v_div(a, b);
    }
};
|
|
|
|
|
|
|
|
template<typename T1, typename T2, typename Tvec>
|
|
|
|
struct op_div_scale
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
|
|
|
|
{
|
|
|
|
const v_float32 v_scalar = vx_setall_f32(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_div(v_mul(a, v_scalar), b);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
static inline Tvec pre(const Tvec& denom, const Tvec& res)
|
|
|
|
{
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because the `CV_SIMD` macro also exists in the following files; the reasons they have not been rewritten are:
1. ~~Code designed for fixed-size SIMD (v_int16x8, v_float32x4, etc.); needs to be rewritten manually.~~ Rewritten
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in another class/struct, which is not supported by the compiler in variable-length backends. These cannot be rewritten directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
|
|
|
|
return v_select(v_eq(denom, v_zero), v_zero, res);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
// Scalar scaled division for integer element types.
// a      : numerator.
// denom  : denominator.
// scalar : pointer to the scale factor forwarded to c_div.
// Returns 0 when denom is 0, otherwise c_div(a, denom, *scalar).
static inline T1 r(T1 a, T1 denom, const T2* scalar)
{
    // This overload is only meant for integer T1; reject anything else at compile time.
    CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
    if (denom == (T1)0)
        return (T1)0;
    return c_div(a, denom, *scalar);
}
|
|
|
|
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_div_scale<float, float, v_float32>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-11-11 04:10:57 +08:00
|
|
|
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
|
|
|
|
{
|
|
|
|
const v_float32 v_scalar = vx_setall_f32(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_div(v_mul(a, v_scalar), b);
|
2018-11-11 04:10:57 +08:00
|
|
|
}
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-11-11 04:10:57 +08:00
|
|
|
static inline float r(float a, float denom, const float* scalar)
|
|
|
|
{ return c_div(a, denom, *scalar); }
|
2018-07-26 04:00:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_div_scale<double, double, v_float64>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
|
|
|
|
{
|
|
|
|
const v_float64 v_scalar = vx_setall_f64(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_div(v_mul(a, v_scalar), b);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
static inline double r(double a, double denom, const double* scalar)
|
2018-11-11 04:10:57 +08:00
|
|
|
{ return c_div(a, denom, *scalar); }
|
2018-07-26 04:00:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
float fscalar = (float)*scalar;
|
|
|
|
// todo: add new intrinsics for integer divide
|
|
|
|
scalar_loop<op_div_scale, T1, float, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, &fscalar);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
void div_loop<float, v_float32>(const float* src1, size_t step1, const float* src2, size_t step2,
|
|
|
|
float* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
float fscalar = (float)*scalar;
|
|
|
|
if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
|
|
|
|
{
|
|
|
|
bin_loop<op_div_f, float, v_float32>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_div_scale, float, float, v_float32>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, &fscalar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
void div_loop<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
|
|
|
|
double* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
if (*scalar == 1.0)
|
|
|
|
{
|
|
|
|
BIN_LOOP64F<op_div_f, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_div_scale, double, double, v_float64>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalar);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// Generates the `div` entry points from div_loop. Depending on the include
// mode (declarations / definitions / dispatching) the DEFINE_SIMD machinery
// expands to a declaration, a definition or a dispatcher.
// NOTE(review): presumably DEFINE_SIMD_ALL covers every supported element type - confirm.
DEFINE_SIMD_ALL(div, div_loop)
|
|
|
|
|
|
|
|
//=========================================================================
|
|
|
|
// AddWeighted
|
|
|
|
//=========================================================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
///// Add scale
|
|
|
|
// "Scale and add" operation: combines a, the scale factor and b through a
// fused multiply-add (vector form) or c_add (scalar form).
// Only the first element pointed to by scalar is read.
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Vector form on float lanes: v_fma(a, broadcast(*scalar), b).
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
    {
        return v_fma(a, vx_setall_f32(*scalar), b);
    }
#endif
    // Scalar form: delegates to c_add with the dereferenced scale.
    static inline T1 r(T1 a, T1 b, const T2* scalar)
    {
        return c_add(a, b, *scalar);
    }
    // No post-processing is needed: the result is handed back untouched.
    static inline Tvec pre(const Tvec&, const Tvec& res)
    {
        return res;
    }
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_add_scale<double, double, v_float64>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
|
|
|
|
{
|
|
|
|
const v_float64 v_alpha = vx_setall_f64(*scalar);
|
|
|
|
return v_fma(a, v_alpha, b);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
static inline double r(double a, double b, const double* scalar)
|
|
|
|
{ return c_add(a, b, *scalar); }
|
|
|
|
static inline v_float64 pre(const v_float64&, const v_float64& res)
|
|
|
|
{ return res; }
|
|
|
|
};
|
|
|
|
|
|
|
|
///// Weighted sum
|
|
|
|
// Weighted-sum operation driven by three coefficients:
// scalars[0] weights a, scalars[1] weights b, scalars[2] is the additive term.
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Vector form on float lanes, built from two nested fused multiply-adds:
    // v_fma(a, alpha, v_fma(b, beta, gamma)).
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
    {
        const v_float32 alpha = vx_setall_f32(scalars[0]);
        const v_float32 beta  = vx_setall_f32(scalars[1]);
        const v_float32 gamma = vx_setall_f32(scalars[2]);
        const v_float32 b_term = v_fma(b, beta, gamma);
        return v_fma(a, alpha, b_term);
    }
#endif
    // Scalar form: delegates to the five-argument c_add.
    static inline T1 r(T1 a, T1 b, const T2* scalars)
    {
        return c_add(a, b, scalars[0], scalars[1], scalars[2]);
    }
    // No post-processing is needed: the result is handed back untouched.
    static inline Tvec pre(const Tvec&, const Tvec& res)
    {
        return res;
    }
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_add_weighted<double, double, v_float64>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
|
|
|
|
{
|
|
|
|
const v_float64 v_alpha = vx_setall_f64(scalars[0]);
|
|
|
|
const v_float64 v_beta = vx_setall_f64(scalars[1]);
|
|
|
|
const v_float64 v_gamma = vx_setall_f64(scalars[2]);
|
|
|
|
return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
static inline double r(double a, double b, const double* scalars)
|
|
|
|
{ return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
|
|
|
|
static inline v_float64 pre(const v_float64&, const v_float64& res)
|
|
|
|
{ return res; }
|
|
|
|
};
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const double* scalars)
|
|
|
|
{
|
|
|
|
float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]};
|
|
|
|
if (fscalars[1] == 1.0f && fscalars[2] == 0.0f)
|
|
|
|
{
|
|
|
|
scalar_loop<op_add_scale, T1, float, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, fscalars);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
scalar_loop<op_add_weighted, T1, float, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, fscalars);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
|
|
|
T1* dst, size_t step, int width, int height, const double* scalars)
|
|
|
|
{
|
|
|
|
if (scalars[1] == 1.0 && scalars[2] == 0.0)
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_add_scale, T1, double, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalars);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_add_weighted, T1, double, Tvec>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalars);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
void add_weighted_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
|
|
|
|
double* dst, size_t step, int width, int height, const double* scalars)
|
|
|
|
{
|
|
|
|
if (scalars[1] == 1.0 && scalars[2] == 0.0)
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_add_scale, double, double, v_float64>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalars);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_add_weighted, double, double, v_float64>(src1, step1, src2, step2,
|
|
|
|
dst, step, width, height, scalars);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// Redefines DISPATCH_SIMD_FUN for the functions declared after this point.
// The generated function `fun` takes SCALAR_ARGS(_T1) plus an untyped
// `void* scalar`; that pointer is cast to `const double*` and forwarded, in
// this order, to CALL_HAL (with the cv_hal_<fun> name), to ARITHM_CALL_IPP
// (with the arithm_ipp_<fun> name) and finally to CV_CPU_DISPATCH.
// NOTE(review): presumably the HAL and IPP macros return early when they
// handle the call - confirm against their definitions.
#undef DISPATCH_SIMD_FUN
|
|
|
|
#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
|
|
|
|
void fun(SCALAR_ARGS(_T1), void* scalar) \
|
|
|
|
{ \
|
|
|
|
CV_INSTRUMENT_REGION(); \
|
|
|
|
CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, (const double*)scalar) \
|
|
|
|
ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, (const double*)scalar) \
|
|
|
|
CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
|
|
|
|
CV_CPU_DISPATCH_MODES_ALL); \
|
|
|
|
}
|
|
|
|
|
|
|
|
// Generates the addWeighted entry points for each element-type group.
// The SAT group is driven by add_weighted_loop, which narrows the
// coefficients to float; the S32, F32 and F64 groups are driven by
// add_weighted_loop_d, which keeps the coefficients in double.
// NOTE(review): presumably SAT denotes the saturating 8/16-bit integer types - confirm.
DEFINE_SIMD_SAT(addWeighted, add_weighted_loop)
|
|
|
|
DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d)
|
|
|
|
DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d)
|
|
|
|
DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
|
|
|
|
|
|
|
|
//=======================================
|
|
|
|
// Reciprocal
|
|
|
|
//=======================================
|
|
|
|
|
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
|
|
|
// Reciprocal functor, generic version: computes (*scalar) / x.
// The scalar overload statically requires T1 to be an integer type; the float and
// double cases are handled by the explicit specializations of this template.
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Vector path: broadcast *scalar to every f32 lane and divide it by `a`.
    static inline v_float32 r(const v_float32& a, const T2* scalar)
    {
        return v_div(vx_setall_f32(*scalar), a);
    }

    // Replaces the lanes of `res` whose denominator lane equals zero with zero;
    // all other lanes of `res` are passed through.
    static inline Tvec pre(const Tvec& denom, const Tvec& res)
    {
        const Tvec zeros = vx_setall<typename VTraits<Tvec>::lane_type>(0);
        const Tvec denom_is_zero = v_eq(denom, zeros);
        return v_select(denom_is_zero, zeros, res);
    }
#endif

    // Scalar path: a zero denominator yields zero, otherwise c_div(*scalar, denom).
    static inline T1 r(T1 denom, const T2* scalar)
    {
        CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
        if (denom == (T1)0)
            return (T1)0;
        return c_div(*scalar, denom);
    }
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_recip<float, float, v_float32>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
2018-11-11 04:10:57 +08:00
|
|
|
static inline v_float32 r(const v_float32& a, const float* scalar)
|
|
|
|
{
|
|
|
|
const v_float32 v_scalar = vx_setall_f32(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_div(v_scalar, a);
|
2018-11-11 04:10:57 +08:00
|
|
|
}
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-11-11 04:10:57 +08:00
|
|
|
static inline float r(float denom, const float* scalar)
|
|
|
|
{ return c_div(*scalar, denom); }
|
2018-07-26 04:00:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
template<>
|
|
|
|
struct op_recip<double, double, v_float64>
|
|
|
|
{
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks will cause test fail when enable for RVV, marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks can not be rewrited directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly ralated with `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
2018-07-26 04:00:37 +08:00
|
|
|
static inline v_float64 r(const v_float64& a, const double* scalar)
|
|
|
|
{
|
|
|
|
const v_float64 v_scalar = vx_setall_f64(*scalar);
|
Merge pull request #23980 from hanliutong:rewrite-core
Rewrite Universal Intrinsic code by using new API: Core module. #23980
The goal of this PR is to match and modify all SIMD code blocks guarded by `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them by using the new Universal Intrinsic API.
The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
Most of the files have been rewritten, but I marked this PR as draft because, the `CV_SIMD` macro also exists in the following files, and the reasons why they are not rewrited are:
1. ~~code design for fixed-size SIMD (v_int16x8, v_float32x4, etc.), need to manually rewrite.~~ Rewrited
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
2. Vector types are wrapped in other class/struct, that are not supported by the compiler in variable-length backends. Can not be rewrited directly.
- ./modules/core/src/mathfuncs_core.simd.hpp
```cpp
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
...
}
v_float32 compute(const v_float32& y, const v_float32& x)
{
...
}
...
v_float32 val90; // sizeless type can not used in a class
v_float32 val180;
v_float32 val360;
v_float32 s;
};
```
3. The API interface does not support/does not match
- ./modules/core/src/norm.cpp
Use `v_popcount`, ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Use illegal Universal Intrinsic API: For float type, there is no logical operation `|`. Further discussion needed
```cpp
/** @brief Bitwise OR
Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```
```cpp
#if CV_SIMD
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
v_type v1 = vx_load(src);
src += v_type::nlanes;
....
src += v_type::nlanes;
v0 |= v1; //Illegal ?
....
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
}
v_cleanup();
#endif
```
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-08-11 13:33:33 +08:00
|
|
|
return v_div(v_scalar, a);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
static inline double r(double denom, const double* scalar)
|
2018-11-11 04:10:57 +08:00
|
|
|
{ return c_div(*scalar, denom); }
|
2018-07-26 04:00:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
//////////////////////////// Loops /////////////////////////////////
|
|
|
|
|
|
|
|
template<typename T1, typename Tvec>
|
|
|
|
static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
float fscalar = (float)*scalar;
|
|
|
|
scalar_loop<op_recip, T1, float, Tvec>(src1, step1, dst, step, width, height, &fscalar);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<>
|
|
|
|
void recip_loop<double, v_float64>(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar)
|
|
|
|
{
|
|
|
|
SCALAR_LOOP64F<op_recip, double, double, v_float64>(src1, step1, dst, step, width, height, scalar);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif // ARITHM_DEFINITIONS_ONLY
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// Parameter-list helpers for the single-source (reciprocal) entry points.
// SCALAR_ARGS(_T1) expands to the formal parameters: one source pointer with its step,
// the destination pointer with its step, and the width/height.
// Both macros are #undef'd first, so any earlier definition is replaced from here on.
#undef SCALAR_ARGS
|
|
|
|
#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height
|
|
|
|
|
|
|
|
// SCALAR_ARGS_PASS expands to the matching argument names, in the same order, for
// forwarding the SCALAR_ARGS parameters to another call.
#undef SCALAR_ARGS_PASS
|
|
|
|
#define SCALAR_ARGS_PASS src1, step1, dst, step, width, height
|
|
|
|
|
|
|
|
// Dispatcher generator for the reciprocal entry points.
// The generated `fun` takes two leading unnamed parameters (const _T1*, size_t) that
// are ignored, followed by SCALAR_ARGS(_T1) and an opaque `void* scalar`.
// CALL_HAL (cv_hal_<fun>) and ARITHM_CALL_IPP (arithm_ipp_<fun>) receive the scalar by
// value (the pointer is dereferenced as a double), whereas CV_CPU_DISPATCH receives it
// as a `const double*`.
// NOTE(review): CALL_HAL / ARITHM_CALL_IPP are defined elsewhere; presumably they
// return early when an accelerated backend handles the call - confirm.
// _Tvec and the variadic arguments are accepted but not used by this expansion.
#undef DISPATCH_SIMD_FUN
|
|
|
|
#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \
|
|
|
|
void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar) \
|
|
|
|
{ \
|
|
|
|
CV_INSTRUMENT_REGION(); \
|
|
|
|
CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, *(const double*)scalar) \
|
|
|
|
ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \
|
|
|
|
SCALAR_ARGS_PASS, *(const double*)scalar) \
|
|
|
|
CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \
|
|
|
|
CV_CPU_DISPATCH_MODES_ALL); \
|
|
|
|
}
|
|
|
|
|
|
|
|
// Instantiate the `recip` entry points through DEFINE_SIMD_ALL, backed by recip_loop.
// NOTE(review): DEFINE_SIMD_ALL is defined outside this section; presumably it covers
// every supported element type - confirm against its definition.
DEFINE_SIMD_ALL(recip, recip_loop)
|
|
|
|
|
|
|
|
#ifndef ARITHM_DISPATCHING_ONLY
|
|
|
|
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Define SIMD_GUARD exactly once (no-op if it is already defined).
// NOTE(review): presumably this marks that the file has already been processed once, so
// one-time helper definitions earlier in the file can be skipped when it is included
// again in another declaration/definition/dispatch mode - confirm where it is tested.
#ifndef SIMD_GUARD
|
|
|
|
#define SIMD_GUARD
|
|
|
|
#endif
|
|
|
|
|
2021-05-26 01:15:12 +08:00
|
|
|
}} // cv::hal::
|