2018-07-26 04:00:37 +08:00
|
|
|
// This file is part of OpenCV project.
|
|
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
|
|
// of this distribution and at http://opencv.org/license.html
|
|
|
|
|
|
|
|
#include "opencv2/core/hal/intrin.hpp"
|
|
|
|
|
|
|
|
//=========================================
|
|
|
|
// Declare & Define & Dispatch in one step
|
|
|
|
//=========================================
|
|
|
|
|
|
|
|
// ARITHM_DISPATCHING_ONLY defined by arithm dispatch file
|
|
|
|
|
|
|
|
// Decide what this inclusion of the file has to provide.
//
// ARITHM_DECLARATIONS_ONLY is (re)defined only when the includer asked for
// declarations via CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY.
#undef ARITHM_DECLARATIONS_ONLY
#ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#define ARITHM_DECLARATIONS_ONLY
#endif

// ARITHM_DEFINITIONS_ONLY is (re)defined only when neither declarations-only
// mode nor ARITHM_DISPATCHING_ONLY is requested.
#undef ARITHM_DEFINITIONS_ONLY
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY)
#define ARITHM_DEFINITIONS_ONLY
#endif
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
namespace cv { namespace hal {
|
|
|
|
|
|
|
|
#ifndef ARITHM_DISPATCHING_ONLY
|
|
|
|
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
|
|
|
#endif
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
#if (defined ARITHM_DECLARATIONS_ONLY) || (defined ARITHM_DEFINITIONS_ONLY)
|
|
|
|
|
|
|
|
// DECLARE_SIMPLE_BINARY_OP(opname, type)
// Expands to the prototype (no body, no trailing ';') of a binary kernel
// `opname` that reads two `const type*` inputs and writes a `type*` output.
// Each pointer is followed by its own size_t step; width and height are ints.
#undef DECLARE_SIMPLE_BINARY_OP
#define DECLARE_SIMPLE_BINARY_OP(opname, type) \
    void opname(const type* src1, size_t step1, const type* src2, size_t step2, \
                type* dst, size_t step, int width, int height)
|
|
|
|
|
|
|
|
// DECLARE_SIMPLE_BINARY_OP_ALLTYPES(opname)
// Expands to one DECLARE_SIMPLE_BINARY_OP per supported element type. The
// generated names are `opname` with a type suffix appended:
//   8u/8s/16u/16s/32u/32s/64u/64s, 16f (hfloat), 16bf (bfloat), 32f, 64f.
// The last declaration has no trailing ';' so the invocation supplies it.
#undef DECLARE_SIMPLE_BINARY_OP_ALLTYPES
#define DECLARE_SIMPLE_BINARY_OP_ALLTYPES(opname) \
    DECLARE_SIMPLE_BINARY_OP(opname##8u, uchar); \
    DECLARE_SIMPLE_BINARY_OP(opname##8s, schar); \
    DECLARE_SIMPLE_BINARY_OP(opname##16u, ushort); \
    DECLARE_SIMPLE_BINARY_OP(opname##16s, short); \
    DECLARE_SIMPLE_BINARY_OP(opname##32u, unsigned); \
    DECLARE_SIMPLE_BINARY_OP(opname##32s, int); \
    DECLARE_SIMPLE_BINARY_OP(opname##64u, uint64); \
    DECLARE_SIMPLE_BINARY_OP(opname##64s, int64); \
    DECLARE_SIMPLE_BINARY_OP(opname##16f, hfloat); \
    DECLARE_SIMPLE_BINARY_OP(opname##16bf, bfloat); \
    DECLARE_SIMPLE_BINARY_OP(opname##32f, float); \
    DECLARE_SIMPLE_BINARY_OP(opname##64f, double)
|
|
|
|
|
|
|
|
// Prototypes of the add / sub / max / min / absdiff kernels, one per element
// type enumerated by DECLARE_SIMPLE_BINARY_OP_ALLTYPES.
DECLARE_SIMPLE_BINARY_OP_ALLTYPES(add);
DECLARE_SIMPLE_BINARY_OP_ALLTYPES(sub);
DECLARE_SIMPLE_BINARY_OP_ALLTYPES(max);
DECLARE_SIMPLE_BINARY_OP_ALLTYPES(min);
DECLARE_SIMPLE_BINARY_OP_ALLTYPES(absdiff);
|
|
|
|
|
|
|
|
// Prototypes of the byte-wise logical kernels. All four share one signature;
// note that not8u also takes src2/step2 even though it is named after a
// unary operation.
void and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
void or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
void xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DECLARE_CMP_OP(opname, type)
// Expands to the prototype (no body, no trailing ';') of a comparison kernel
// `opname`: two `const type*` inputs, a `uchar*` output regardless of `type`,
// and a trailing `int cmpop` argument after width and height.
#undef DECLARE_CMP_OP
#define DECLARE_CMP_OP(opname, type) \
    void opname(const type* src1, size_t step1, const type* src2, size_t step2, \
                uchar* dst, size_t step, int width, int height, int cmpop)
|
|
|
|
|
|
|
|
// Prototypes of the comparison kernels, one per supported element type.
DECLARE_CMP_OP(cmp8u, uchar);
DECLARE_CMP_OP(cmp8s, schar);
DECLARE_CMP_OP(cmp16u, ushort);
DECLARE_CMP_OP(cmp16s, short);
DECLARE_CMP_OP(cmp32u, unsigned);
DECLARE_CMP_OP(cmp32s, int);
DECLARE_CMP_OP(cmp64u, uint64);
DECLARE_CMP_OP(cmp64s, int64);
DECLARE_CMP_OP(cmp16f, hfloat);
DECLARE_CMP_OP(cmp16bf, bfloat);
DECLARE_CMP_OP(cmp32f, float);
DECLARE_CMP_OP(cmp64f, double);
|
|
|
|
|
|
|
|
// DECLARE_SCALED_BINARY_OP(opname, type, scale_arg)
// Same prototype shape as the simple binary kernels, with `scale_arg` pasted
// verbatim as the final parameter declaration (e.g. `double` or a named
// array parameter).
#undef DECLARE_SCALED_BINARY_OP
#define DECLARE_SCALED_BINARY_OP(opname, type, scale_arg) \
    void opname(const type* src1, size_t step1, const type* src2, size_t step2, \
                type* dst, size_t step, int width, int height, scale_arg)
|
|
|
|
|
|
|
|
// DECLARE_SCALED_BINARY_OP_ALLTYPES(opname, scale_arg)
// Expands to one DECLARE_SCALED_BINARY_OP per supported element type, using
// the same type suffixes as the simple variant and forwarding `scale_arg`
// unchanged. The last declaration has no trailing ';'.
#undef DECLARE_SCALED_BINARY_OP_ALLTYPES
#define DECLARE_SCALED_BINARY_OP_ALLTYPES(opname, scale_arg) \
    DECLARE_SCALED_BINARY_OP(opname##8u, uchar, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##8s, schar, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##16u, ushort, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##16s, short, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##32u, unsigned, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##32s, int, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##64u, uint64, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##64s, int64, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##16f, hfloat, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##16bf, bfloat, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##32f, float, scale_arg); \
    DECLARE_SCALED_BINARY_OP(opname##64f, double, scale_arg)
|
|
|
|
|
|
|
|
// Prototypes of the scaled kernels for every element type. mul/div/recip take
// an unnamed double as the extra argument; addWeighted takes `double weights[3]`.
DECLARE_SCALED_BINARY_OP_ALLTYPES(mul, double);
DECLARE_SCALED_BINARY_OP_ALLTYPES(div, double);
DECLARE_SCALED_BINARY_OP_ALLTYPES(recip, double);
DECLARE_SCALED_BINARY_OP_ALLTYPES(addWeighted, double weights[3]);
|
2018-07-26 04:00:37 +08:00
|
|
|
|
2022-07-20 01:02:00 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
#ifdef ARITHM_DEFINITIONS_ONLY
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; they are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
// SIMD_ONLY(expr): keeps `expr` when universal intrinsics are available
// (CV_SIMD or CV_SIMD_SCALABLE is non-zero) and drops it entirely otherwise,
// so vector-only statements can be written inline in the kernels below.
#if (CV_SIMD || CV_SIMD_SCALABLE)
#define SIMD_ONLY(expr) expr
#else
#define SIMD_ONLY(expr)
#endif
|
|
|
|
|
|
|
|
//=======================================
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Arithmetic and logical operations
|
|
|
|
// +, -, *, /, &, |, ^, ~, abs ...
|
2018-07-26 04:00:37 +08:00
|
|
|
//=======================================
|
|
|
|
|
|
|
|
///////////////////////////// Operations //////////////////////////////////
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Drop any previous definitions of the kernel-generating macros before they
// are defined again. The half-precision variant is spelled ..._16F at its
// definition, so that is the name that has to be undefined here (the earlier
// ..._F16 spelling matched no macro).
#undef DEFINE_SIMPLE_BINARY_OP
#undef DEFINE_SIMPLE_BINARY_OP_16F
#undef DEFINE_SIMPLE_BINARY_OP_NOSIMD
|
|
|
|
|
|
|
|
// DEFINE_SIMPLE_BINARY_OP(opname, T1, Tvec, scalar_op, vec_op)
// Generates the definition of the element-wise binary kernel `opname`.
//   opname    - name of the generated function
//   T1        - element type of src1, src2 and dst
//   Tvec      - vector type whose lane count sets the SIMD stride
//   scalar_op - applied per element in the scalar loop; its result goes
//               through saturate_cast<T1>
//   vec_op    - applied to two vectors loaded with vx_load
// The three step arguments are divided by sizeof(T1) and then used as element
// strides between rows. Statements wrapped in SIMD_ONLY() exist only when
// that macro keeps its argument.
// Row tail: when fewer than simd_width elements remain, x is moved back to
// width - simd_width so one overlapping vector finishes the row. This is
// skipped (the scalar loop finishes the row instead) when the row is shorter
// than one vector or when dst is the same pointer as src1 or src2 --
// presumably because an overlapping in-place pass would re-read elements that
// were already overwritten.
#define DEFINE_SIMPLE_BINARY_OP(opname, T1, Tvec, scalar_op, vec_op) \
void opname(const T1* src1, size_t step1, \
            const T1* src2, size_t step2, \
            T1* dst, size_t step, \
            int width, int height) \
{ \
    CV_INSTRUMENT_REGION(); \
    SIMD_ONLY(int simd_width = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            vx_store(dst + x, vec_op(vx_load(src1 + x), vx_load(src2 + x))); \
        }) \
        for (; x < width; x++) \
            dst[x] = saturate_cast<T1>(scalar_op(src1[x], src2[x])); \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SIMPLE_BINARY_OP_16F(opname, T1, scalar_op, vec_op)
// Variant of the simple binary kernel that computes in float.
//   opname    - name of the generated function
//   T1        - stored element type of src1, src2 and dst
//   scalar_op - applied to both operands after a cast to float; T1 is then
//               constructed from the result (no saturate_cast here)
//   vec_op    - applied to two vectors obtained with vx_load_expand; the
//               result is written with v_pack_store
// The SIMD stride is the lane count of v_float32. Step handling and the
// overlapping-tail rule (skipped for rows shorter than one vector or when dst
// is the same pointer as src1 or src2) are the same as in the plain kernel.
#define DEFINE_SIMPLE_BINARY_OP_16F(opname, T1, scalar_op, vec_op) \
void opname(const T1* src1, size_t step1, \
            const T1* src2, size_t step2, \
            T1* dst, size_t step, \
            int width, int height) \
{ \
    CV_INSTRUMENT_REGION(); \
    SIMD_ONLY(int simd_width = VTraits<v_float32>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_pack_store(dst + x, vec_op(vx_load_expand(src1 + x), vx_load_expand(src2 + x))); \
        }) \
        for (; x < width; x++) \
            dst[x] = T1(scalar_op((float)src1[x], (float)src2[x])); \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname, T1, worktype, scalar_op)
// Generates a scalar-only element-wise binary kernel `opname`.
//   opname    - name of the generated function
//   T1        - element type of src1, src2 and dst
//   worktype  - type both operands are cast to before scalar_op is applied
//   scalar_op - per-element operation; its result goes through
//               saturate_cast<T1>
// The three step arguments are divided by sizeof(T1) and then used as element
// strides between rows.
#define DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname, T1, worktype, scalar_op) \
void opname(const T1* src1, size_t step1, \
            const T1* src2, size_t step2, \
            T1* dst, size_t step, \
            int width, int height) \
{ \
    CV_INSTRUMENT_REGION(); \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        for (int x = 0; x < width; x++) \
            dst[x] = saturate_cast<T1>(scalar_op((worktype)src1[x], (worktype)src2[x])); \
    } \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Scalar building blocks passed as `scalar_op` to the kernel generators.
// Every argument is parenthesised, so expressions are safe to pass.
#undef scalar_add
#define scalar_add(x, y) ((x) + (y))
#undef scalar_sub
#define scalar_sub(x, y) ((x) - (y))
// Subtraction that yields 0 whenever x <= y instead of going below zero
// (named for the unsigned 64-bit kernels). Each argument may be evaluated twice.
#undef scalar_sub_u64
#define scalar_sub_u64(x, y) ((x) <= (y) ? 0 : (x) - (y))
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Drop any earlier definition so the macro can be defined again below.
#undef DEFINE_SIMPLE_BINARY_OP_64F
|
Merge pull request #24325 from hanliutong:rewrite
Rewrite Universal Intrinsic code: float related part #24325
The goal of this series of PRs is to modify the SIMD code blocks guarded by CV_SIMD macro: rewrite them by using the new Universal Intrinsic API.
The series of PRs is listed below:
#23885 First patch, an example
#23980 Core module
#24058 ImgProc module, part 1
#24132 ImgProc module, part 2
#24166 ImgProc module, part 3
#24301 Features2d and calib3d module
#24324 Gapi module
This patch (hopefully) is the last one in the series.
This patch mainly involves 3 parts
1. Add some modifications related to float (CV_SIMD_64F)
2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`,
then we can get the `CV_SIMD` module that is not enabled for `CV_SIMD_SCALABLE` by looking for `if CV_SIMD`
3. Summary of `CV_SIMD` blocks that remains unmodified: Updated comments
- Some blocks cause test failures when enabled for RVV; they are marked as `TODO: enable for CV_SIMD_SCALABLE, ....`
- Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here)
- ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct)
- ./modules/imgproc/src/color_lab.cpp (Array of vector type)
- ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type)
- ./modules/imgproc/src/sumpixels.simd.hpp (fixed length algorithm, strongly related to `CV_SIMD_WIDTH`)
These algorithms will need to be redesigned to accommodate scalable backends.
### Pull Request Readiness Checklist
See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request
- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
2023-10-05 22:57:25 +08:00
|
|
|
// DEFINE_SIMPLE_BINARY_OP_64F(opname, scalar_op, vec_op)
// Generates the double-precision kernel `opname`. When 64-bit float vectors
// are available (CV_SIMD_64F or CV_SIMD_SCALABLE_64F is non-zero) it forwards
// to DEFINE_SIMPLE_BINARY_OP with v_float64; otherwise it forwards to the
// scalar-only generator and `vec_op` is ignored.
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SIMPLE_BINARY_OP_64F(opname, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname, double, v_float64, scalar_op, vec_op)
#else
#define DEFINE_SIMPLE_BINARY_OP_64F(opname, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname, double, double, scalar_op)
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SIMPLE_BINARY_OP_ALLTYPES(opname, scalar_op, scalar_op_u64, vec_op)
// Emits one element-wise binary function per supported depth, named
// opname + suffix (8u, 8s, 16u, 16s, 32u, 32s, 64u, 64s, 16f, 16bf, 32f, 64f).
//   scalar_op     - scalar operation used by every depth except 64u
//   scalar_op_u64 - scalar operation used for uint64 only
//   vec_op        - vector operation for the depths that get a SIMD path
// 32u, 64u and 64s go through the scalar-only generator; for 32u the second
// type argument is int64 (presumably the intermediate type - confirm against
// DEFINE_SIMPLE_BINARY_OP_NOSIMD).
// opname is always pasted with ## so that names such as min/max are never
// macro-expanded on their own.
#undef DEFINE_SIMPLE_BINARY_OP_ALLTYPES
#define DEFINE_SIMPLE_BINARY_OP_ALLTYPES(opname, scalar_op, scalar_op_u64, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##8u, uchar, v_uint8, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##8s, schar, v_int8, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##16u, ushort, v_uint16, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##16s, short, v_int16, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname##32u, unsigned, int64, scalar_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##32s, int, v_int32, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname##64u, uint64, uint64, scalar_op_u64) \
    DEFINE_SIMPLE_BINARY_OP_NOSIMD(opname##64s, int64, int64, scalar_op) \
    DEFINE_SIMPLE_BINARY_OP_16F(opname##16f, hfloat, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP_16F(opname##16bf, bfloat, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP(opname##32f, float, v_float32, scalar_op, vec_op) \
    DEFINE_SIMPLE_BINARY_OP_64F(opname##64f, scalar_op, vec_op)
|
|
|
|
|
|
|
|
// Instantiate add/sub/max/min for every depth covered by
// DEFINE_SIMPLE_BINARY_OP_ALLTYPES. Arguments are, in order: function name
// prefix, generic scalar op, scalar op for uint64, vector op.
// Only sub passes a dedicated uint64 scalar op (scalar_sub_u64); add, max and
// min reuse the generic scalar op for uint64 as well.
DEFINE_SIMPLE_BINARY_OP_ALLTYPES(add, scalar_add, scalar_add, v_add)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_ALLTYPES(sub, scalar_sub, scalar_sub_u64, v_sub)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_ALLTYPES(max, std::max, std::max, v_max)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_ALLTYPES(min, std::min, std::min, v_min)
|
|
|
|
|
|
|
|
// Scalar absolute-difference helpers used as the scalar_op of the absdiff
// generators. Both are function-like macros and evaluate their arguments more
// than once, so pass side-effect-free expressions only.
// Both names are #undef'ed first so a definition left over from an earlier
// inclusion can never clash with the ones below.
#undef scalar_absdiff
#undef scalar_absdiffu
// Signed and floating-point operands: plain difference, then std::abs.
// NOTE(review): for same-width signed integers the subtraction itself can
// overflow before std::abs is applied - confirm the intermediate type used by
// the callers.
#define scalar_absdiff(x, y) std::abs((x) - (y))
// Unsigned operands: subtract the smaller value from the larger one so the
// difference never wraps around.
#define scalar_absdiffu(x, y) (std::max((x), (y)) - std::min((x), (y)))
|
|
|
|
|
|
|
|
// absdiff for the integer depths.
// 8- and 16-bit depths use the vectorized generator: the unsigned ones pass
// v_absdiff, the signed ones pass v_absdiffs as the vector op.
// 32- and 64-bit depths use the scalar-only generator; the unsigned ones use
// scalar_absdiffu, the signed ones scalar_absdiff.
// NOTE(review): absdiff32s/absdiff64s name the element type itself as the
// third argument; if that is the intermediate type, (x) - (y) can overflow for
// operands of opposite sign and large magnitude - confirm.
DEFINE_SIMPLE_BINARY_OP(absdiff8u, uchar, v_uint8, scalar_absdiff, v_absdiff)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP(absdiff8s, schar, v_int8, scalar_absdiff, v_absdiffs)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP(absdiff16u, ushort, v_uint16, scalar_absdiff, v_absdiff)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP(absdiff16s, short, v_int16, scalar_absdiff, v_absdiffs)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_NOSIMD(absdiff32u, unsigned, unsigned, scalar_absdiffu)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_NOSIMD(absdiff32s, int, int, scalar_absdiff)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_NOSIMD(absdiff64u, uint64, uint64, scalar_absdiffu)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_NOSIMD(absdiff64s, int64, int64, scalar_absdiff)
|
2024-04-01 22:09:24 +08:00
|
|
|
// absdiff for the two 16-bit floating-point element types (hfloat, bfloat),
// both generated by the dedicated 16F generator with scalar_absdiff/v_absdiff.
DEFINE_SIMPLE_BINARY_OP_16F(absdiff16f, hfloat, scalar_absdiff, v_absdiff)
|
2024-03-22 08:45:59 +08:00
|
|
|
DEFINE_SIMPLE_BINARY_OP_16F(absdiff16bf, bfloat, scalar_absdiff, v_absdiff)
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// absdiff for float and double. The 64F generator decides at preprocessing
// time whether the double variant gets a vector path.
DEFINE_SIMPLE_BINARY_OP(absdiff32f, float, v_float32, scalar_absdiff, v_absdiff)
|
|
|
|
DEFINE_SIMPLE_BINARY_OP_64F(absdiff64f, scalar_absdiff, v_absdiff)
|
|
|
|
|
|
|
|
// DEFINE_BINARY_LOGIC_OP(opname, scalar_op, vec_op)
// Generates `opname`, an element-wise bitwise operation on two 8-bit inputs.
//   src1/src2/dst      - pointers to the first row of each buffer
//   step1/step2/step   - added to the respective pointer after every row
//   width, height      - elements per row and number of rows
//   scalar_op          - infix operator used for the scalar tail
//   vec_op             - vector counterpart applied to whole v_uint8 vectors
#undef DEFINE_BINARY_LOGIC_OP
#define DEFINE_BINARY_LOGIC_OP(opname, scalar_op, vec_op) \
void opname(const uchar* src1, size_t step1, \
            const uchar* src2, size_t step2, \
            uchar* dst, size_t step, \
            int width, int height) \
{ \
    CV_INSTRUMENT_REGION(); \
    const int lanes = VTraits<v_uint8>::vlanes(); \
    while (--height >= 0) { \
        int col = 0; \
        while (col < width) { \
            if (col + lanes > width) { \
                /* Partial vector left. Fall back to the scalar tail when the row */ \
                /* is shorter than one vector or the operation runs in place: the */ \
                /* backed-up vector below would re-read already written elements. */ \
                if (col == 0 || dst == src1 || dst == src2) \
                    break; \
                /* Otherwise redo the last full vector, overlapping earlier output. */ \
                col = width - lanes; \
            } \
            vx_store(dst + col, vec_op(vx_load(src1 + col), vx_load(src2 + col))); \
            col += lanes; \
        } \
        /* Scalar tail: whatever the vector loop left unprocessed. */ \
        for (; col < width; ++col) \
            dst[col] = (uchar)(src1[col] scalar_op src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
    vx_cleanup(); \
}
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Bitwise AND / OR / XOR on 8-bit data: each line pairs the scalar infix
// operator with the matching vector op.
DEFINE_BINARY_LOGIC_OP(and8u, &, v_and)
|
|
|
|
DEFINE_BINARY_LOGIC_OP(or8u, |, v_or)
|
|
|
|
DEFINE_BINARY_LOGIC_OP(xor8u, ^, v_xor)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
void not8u(const uchar* src1, size_t step1,
|
|
|
|
const uchar*, size_t,
|
|
|
|
uchar* dst, size_t step,
|
|
|
|
int width, int height)
|
2018-07-26 04:00:37 +08:00
|
|
|
{
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
CV_INSTRUMENT_REGION();
|
|
|
|
int simd_width = VTraits<v_uint8>::vlanes();
|
|
|
|
for (; --height >= 0; src1 += step1, dst += step) {
|
2018-07-26 04:00:37 +08:00
|
|
|
int x = 0;
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
for (; x < width; x += simd_width)
|
2018-07-26 04:00:37 +08:00
|
|
|
{
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
if (x + simd_width > width) {
|
|
|
|
if (((x == 0) | (dst == src1)) != 0)
|
|
|
|
break;
|
|
|
|
x = width - simd_width;
|
|
|
|
}
|
|
|
|
vx_store(dst + x, v_not(vx_load(src1 + x)));
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
for (; x < width; x++)
|
|
|
|
dst[x] = (uchar)(~src1[x]);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
vx_cleanup();
|
|
|
|
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
//=======================================
|
|
|
|
// Compare
|
|
|
|
//=======================================
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Drop any earlier definitions of the per-width comparison generators before
// they are defined below.
#undef DEFINE_CMP_OP_8
|
|
|
|
#undef DEFINE_CMP_OP_16
|
|
|
|
#undef DEFINE_CMP_OP_16F
|
|
|
|
#undef DEFINE_CMP_OP_32
|
|
|
|
#undef DEFINE_CMP_OP_64
|
|
|
|
|
|
|
|
// Comparison for 8-bit types.
// DEFINE_CMP_OP_8(opname, T1, Tvec, scalar_op, vec_op) generates a file-local
// function that writes, per element, 255 where `src1 scalar_op src2` holds and
// 0 otherwise (scalar path: the bool is negated as int and cast to uchar).
//   T1 / Tvec        - element type and its vector type
//   step1, step2     - source row strides in bytes; converted to elements
//   step             - destination row stride in bytes (dst is uchar*)
//   vec_op           - vector comparison whose result is stored as-is
//                      (presumably an all-ones mask per true lane - confirm)
#define DEFINE_CMP_OP_8(opname, T1, Tvec, scalar_op, vec_op) \
static void opname(const T1* src1, size_t step1, \
                   const T1* src2, size_t step2, \
                   uchar* dst, size_t step, \
                   int width, int height) \
{ \
    SIMD_ONLY(const int lanes = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    while (--height >= 0) { \
        int col = 0; \
        SIMD_ONLY(while (col < width) \
        { \
            if (col + lanes > width) { \
                /* Partial vector left: use the scalar tail for rows shorter than */ \
                /* one vector or when dst aliases a source; else redo the last    */ \
                /* full vector, overlapping output that was already written.      */ \
                if (col == 0 || dst == (uchar*)src1 || dst == (uchar*)src2) \
                    break; \
                col = width - lanes; \
            } \
            vx_store((T1*)(dst + col), vec_op(vx_load(src1 + col), vx_load(src2 + col))); \
            col += lanes; \
        }) \
        for (; col < width; ++col) \
            dst[col] = (uchar)-(int)(src1[col] scalar_op src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Comparison for 16-bit integer types.
// DEFINE_CMP_OP_16(opname, T1, Tvec, scalar_op, vec_op) generates a file-local
// function that writes, per element, 255 where `src1 scalar_op src2` holds and
// 0 otherwise. The vector path reinterprets the comparison result as signed
// 16-bit lanes and stores it through v_pack_store as 8-bit values.
//   step1, step2 - source row strides in bytes; converted to elements
//   step         - destination row stride in bytes (dst is uchar*)
#define DEFINE_CMP_OP_16(opname, T1, Tvec, scalar_op, vec_op) \
static void opname(const T1* src1, size_t step1, \
                   const T1* src2, size_t step2, \
                   uchar* dst, size_t step, \
                   int width, int height) \
{ \
    SIMD_ONLY(const int lanes = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    while (--height >= 0) { \
        int col = 0; \
        SIMD_ONLY(while (col < width) \
        { \
            if (col + lanes > width) { \
                /* Partial vector left: rows shorter than one vector go to the   */ \
                /* scalar tail; otherwise redo the last full vector (overlapped). */ \
                if (col == 0) \
                    break; \
                col = width - lanes; \
            } \
            v_pack_store((schar*)(dst + col), v_reinterpret_as_s16(vec_op(vx_load(src1 + col), vx_load(src2 + col)))); \
            col += lanes; \
        }) \
        for (; col < width; ++col) \
            dst[col] = (uchar)-(int)(src1[col] scalar_op src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Comparison for 16-bit floating-point types.
// DEFINE_CMP_OP_16F(opname, T1, scalar_op, vec_op) generates a file-local
// function that writes, per element, 255 where `src1 scalar_op src2` holds and
// 0 otherwise. The scalar path compares the operands after converting both to
// float. The vector path handles two v_float32 vectors per iteration: inputs
// are read with vx_load_expand (presumably widening to float - confirm), the
// two results are reinterpreted as signed 32-bit lanes, packed together and
// stored through v_pack_store as 8-bit values.
//   step1, step2 - source row strides in bytes; converted to elements
//   step         - destination row stride in bytes (dst is uchar*)
#define DEFINE_CMP_OP_16F(opname, T1, scalar_op, vec_op) \
static void opname(const T1* src1, size_t step1, \
                   const T1* src2, size_t step2, \
                   uchar* dst, size_t step, \
                   int width, int height) \
{ \
    SIMD_ONLY(const int lanes = VTraits<v_float32>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    while (--height >= 0) { \
        int col = 0; \
        SIMD_ONLY(while (col < width) \
        { \
            if (col + lanes*2 > width) { \
                /* Fewer than two vectors left: short rows go to the scalar tail; */ \
                /* otherwise redo the last full pair of vectors (overlapped).      */ \
                if (col == 0) \
                    break; \
                col = width - lanes*2; \
            } \
            auto lo = v_reinterpret_as_s32(vec_op(vx_load_expand(src1 + col), \
                                                  vx_load_expand(src2 + col))); \
            auto hi = v_reinterpret_as_s32(vec_op(vx_load_expand(src1 + col + lanes), \
                                                  vx_load_expand(src2 + col + lanes))); \
            auto packed = v_pack(lo, hi); \
            v_pack_store((schar*)(dst + col), packed); \
            col += lanes*2; \
        }) \
        for (; col < width; ++col) \
            dst[col] = (uchar)-(int)((float)src1[col] scalar_op (float)src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Comparison for 32-bit types.
// DEFINE_CMP_OP_32(opname, T1, Tvec, scalar_op, vec_op) generates a file-local
// function that writes, per element, 255 where `src1 scalar_op src2` holds and
// 0 otherwise. The vector path handles two vectors per iteration: both results
// are reinterpreted as signed 32-bit lanes, packed together and stored through
// v_pack_store as 8-bit values.
//   step1, step2 - source row strides in bytes; converted to elements
//   step         - destination row stride in bytes (dst is uchar*)
#define DEFINE_CMP_OP_32(opname, T1, Tvec, scalar_op, vec_op) \
static void opname(const T1* src1, size_t step1, \
                   const T1* src2, size_t step2, \
                   uchar* dst, size_t step, \
                   int width, int height) \
{ \
    SIMD_ONLY(const int lanes = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    while (--height >= 0) { \
        int col = 0; \
        SIMD_ONLY(while (col < width) \
        { \
            if (col + lanes*2 > width) { \
                /* Fewer than two vectors left: short rows go to the scalar tail; */ \
                /* otherwise redo the last full pair of vectors (overlapped).      */ \
                if (col == 0) \
                    break; \
                col = width - lanes*2; \
            } \
            auto lo = v_reinterpret_as_s32(vec_op(vx_load(src1 + col), \
                                                  vx_load(src2 + col))); \
            auto hi = v_reinterpret_as_s32(vec_op(vx_load(src1 + col + lanes), \
                                                  vx_load(src2 + col + lanes))); \
            auto packed = v_pack(lo, hi); \
            v_pack_store((schar*)(dst + col), packed); \
            col += lanes*2; \
        }) \
        for (; col < width; ++col) \
            dst[col] = (uchar)-(int)(src1[col] scalar_op src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Comparison for 64-bit types. There is no explicit SIMD path here; the plain
// loop is left as is in the hope that the compiler vectorizes it.
// DEFINE_CMP_OP_64(opname, T1, scalar_op) generates a file-local function that
// writes, per element, 255 where `src1 scalar_op src2` holds and 0 otherwise.
//   step1, step2 - source row strides in bytes; converted to elements
//   step         - destination row stride in bytes (dst is uchar*)
#define DEFINE_CMP_OP_64(opname, T1, scalar_op) \
static void opname(const T1* src1, size_t step1, \
                   const T1* src2, size_t step2, \
                   uchar* dst, size_t step, \
                   int width, int height) \
{ \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    while (--height >= 0) { \
        for (int col = 0; col < width; ++col) \
            dst[col] = (uchar)-(int)(src1[col] scalar_op src2[col]); \
        src1 += step1; \
        src2 += step2; \
        dst += step; \
    } \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_CMP_OP_ALLTYPES(opname, scalar_op, vec_op)
// Emits one comparison function per supported depth, named opname + suffix
// (8u, 8s, 16u, 16s, 32u, 32s, 64u, 64s, 16f, 16bf, 32f, 64f).
//   scalar_op - infix comparison operator used on the scalar path
//   vec_op    - vector comparison used by the generators that have a SIMD path
// All 64-bit depths (64u, 64s, 64f) use the scalar-only generator.
#undef DEFINE_CMP_OP_ALLTYPES
#define DEFINE_CMP_OP_ALLTYPES(opname, scalar_op, vec_op) \
    DEFINE_CMP_OP_8(opname##8u, uchar, v_uint8, scalar_op, vec_op) \
    DEFINE_CMP_OP_8(opname##8s, schar, v_int8, scalar_op, vec_op) \
    DEFINE_CMP_OP_16(opname##16u, ushort, v_uint16, scalar_op, vec_op) \
    DEFINE_CMP_OP_16(opname##16s, short, v_int16, scalar_op, vec_op) \
    DEFINE_CMP_OP_32(opname##32u, unsigned, v_uint32, scalar_op, vec_op) \
    DEFINE_CMP_OP_32(opname##32s, int, v_int32, scalar_op, vec_op) \
    DEFINE_CMP_OP_64(opname##64u, uint64, scalar_op) \
    DEFINE_CMP_OP_64(opname##64s, int64, scalar_op) \
    DEFINE_CMP_OP_16F(opname##16f, hfloat, scalar_op, vec_op) \
    DEFINE_CMP_OP_16F(opname##16bf, bfloat, scalar_op, vec_op) \
    DEFINE_CMP_OP_32(opname##32f, float, v_float32, scalar_op, vec_op) \
    DEFINE_CMP_OP_64(opname##64f, double, scalar_op)
|
|
|
|
|
|
|
|
// Generate the ==, !=, < and <= kernels for every depth. No > or >= kernels
// are generated: the DEFINE_CMP_OP dispatcher serves CMP_GT and CMP_GE by
// calling the < and <= kernels with the two sources swapped.
DEFINE_CMP_OP_ALLTYPES(cmpeq, ==, v_eq)
|
|
|
|
DEFINE_CMP_OP_ALLTYPES(cmpne, !=, v_ne)
|
|
|
|
DEFINE_CMP_OP_ALLTYPES(cmplt, <, v_lt)
|
|
|
|
DEFINE_CMP_OP_ALLTYPES(cmple, <=, v_le)
|
|
|
|
|
|
|
|
// DEFINE_CMP_OP(suffix, type)
// Generates the entry point cmp + suffix, which selects the comparison kernel
// for `cmpop` and forwards all buffer arguments to it.
//   CMP_LT / CMP_LE / CMP_EQ - forwarded to cmplt / cmple / cmpeq unchanged
//   CMP_GT / CMP_GE          - expressed as cmplt / cmple with the two sources
//                              (and their steps) swapped, since a > b is b < a
//   anything else            - must be CMP_NE (checked with CV_Assert), handled
//                              by cmpne
#undef DEFINE_CMP_OP
#define DEFINE_CMP_OP(suffix, type) \
void cmp##suffix(const type* src1, size_t step1, const type* src2, size_t step2, \
                 uchar* dst, size_t step, int width, int height, int cmpop) \
{ \
    CV_INSTRUMENT_REGION(); \
    if (cmpop == CMP_LT) { \
        cmplt##suffix(src1, step1, src2, step2, dst, step, width, height); \
    } else if (cmpop == CMP_GT) { \
        cmplt##suffix(src2, step2, src1, step1, dst, step, width, height); \
    } else if (cmpop == CMP_LE) { \
        cmple##suffix(src1, step1, src2, step2, dst, step, width, height); \
    } else if (cmpop == CMP_GE) { \
        cmple##suffix(src2, step2, src1, step1, dst, step, width, height); \
    } else if (cmpop == CMP_EQ) { \
        cmpeq##suffix(src1, step1, src2, step2, dst, step, width, height); \
    } else { \
        CV_Assert(cmpop == CMP_NE); \
        cmpne##suffix(src1, step1, src2, step2, dst, step, width, height); \
    } \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Generate the cmp<suffix> entry points (selection by `cmpop`) for every
// element type that has cmplt/cmple/cmpeq/cmpne kernels.
DEFINE_CMP_OP(8u, uchar)
DEFINE_CMP_OP(8s, schar)
DEFINE_CMP_OP(16u, ushort)
DEFINE_CMP_OP(16s, short)
DEFINE_CMP_OP(32u, unsigned)
DEFINE_CMP_OP(32s, int)
DEFINE_CMP_OP(64u, uint64)
DEFINE_CMP_OP(64s, int64)
DEFINE_CMP_OP(16f, hfloat)
DEFINE_CMP_OP(16bf, bfloat)
DEFINE_CMP_OP(32f, float)
DEFINE_CMP_OP(64f, double)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
//=======================================
|
|
|
|
// Mul, Div, Recip, AddWeighted
|
|
|
|
//=======================================
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Drop any earlier definitions of the scaled-operation generator macros before
// they are (re)defined below.
// DEFINE_SCALED_OP_64 is kept for backward compatibility; the 64-bit generators
// defined in this section are DEFINE_SCALED_OP_64F_, DEFINE_SCALED_OP_64F and
// DEFINE_SCALED_OP_NOSIMD, so those are undefined here as well.
#undef DEFINE_SCALED_OP_8
#undef DEFINE_SCALED_OP_16
#undef DEFINE_SCALED_OP_16F
#undef DEFINE_SCALED_OP_32
#undef DEFINE_SCALED_OP_64
#undef DEFINE_SCALED_OP_64F_
#undef DEFINE_SCALED_OP_64F
#undef DEFINE_SCALED_OP_NOSIMD
|
|
|
|
|
|
|
|
// DEFINE_SCALED_OP_8 generates `void opname(src1, step1, src2, step2, dst, step,
// width, height, scale_arg)` for 8-bit element types, computing in float.
//
// Macro parameters:
//   opname        - name of the generated function
//   scale_arg     - declaration of the trailing parameter (e.g. `double scale`)
//   T1            - element type of src1/src2/dst; steps are given in bytes and
//                   converted to element units with sizeof(T1)
//   Tvec          - vector type; half of its lane count is handled per SIMD step
//   scalar_op     - macro producing the scalar result from locals f1 (and f2)
//   vec_op        - macro producing the v_float32 result from f1 (and f2)
//   init          - macro declaring the constants used by scalar_op/vec_op; it
//                   must supply its own statement terminators
//   pack_store_op - stores a v_int16 vector to dst, narrowing it to T1
//   when_binary   - this_is_binary / this_is_unary: keeps or drops the src2 loads
//
// Everything wrapped in SIMD_ONLY(...) belongs to the vector path; the scalar
// loop after it processes whatever elements the vector loop did not cover.
#define DEFINE_SCALED_OP_8(opname, scale_arg, T1, Tvec, scalar_op, vec_op, init, pack_store_op, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    SIMD_ONLY(int simd_width = VTraits<Tvec>::vlanes()>>1;) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            /* Partial last vector: step back so it ends exactly at `width`, */ \
            /* unless the row is shorter than one vector or dst aliases a    */ \
            /* source; in those cases leave the tail to the scalar loop.     */ \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_int16 i1 = v_reinterpret_as_s16(vx_load_expand(src1 + x)); \
            when_binary(v_int16 i2 = v_reinterpret_as_s16(vx_load_expand(src2 + x))); \
            /* low half of the 16-bit lanes */ \
            v_float32 f1 = v_cvt_f32(v_expand_low(i1)); \
            when_binary(v_float32 f2 = v_cvt_f32(v_expand_low(i2))); \
            v_float32 g1 = vec_op(); \
            /* high half of the 16-bit lanes */ \
            f1 = v_cvt_f32(v_expand_high(i1)); \
            when_binary(f2 = v_cvt_f32(v_expand_high(i2))); \
            v_float32 g2 = vec_op(); \
            i1 = v_pack(v_round(g1), v_round(g2)); \
            pack_store_op(dst + x, i1); \
        }) \
        for (; x < width; x++) { \
            float f1 = (float)src1[x]; \
            when_binary(float f2 = (float)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SCALED_OP_16 generates `void opname(src1, step1, src2, step2, dst, step,
// width, height, scale_arg)` for 16-bit integer element types, computing in float.
//
// Macro parameters:
//   opname        - name of the generated function
//   scale_arg     - declaration of the trailing parameter (e.g. `double scale`)
//   T1            - element type; byte steps are divided by sizeof(T1)
//   Tvec          - vector type; half of its lane count is handled per SIMD step
//   scalar_op     - macro producing the scalar result from locals f1 (and f2)
//   vec_op        - macro producing the v_float32 result from f1 (and f2)
//   init          - macro declaring the constants used by scalar_op/vec_op; it
//                   must supply its own statement terminators
//   pack_store_op - stores a v_int32 vector to dst, narrowing it to T1
//   when_binary   - this_is_binary / this_is_unary: keeps or drops the src2 loads
//
// SIMD_ONLY(...) wraps the vector path; the scalar loop handles the remainder.
#define DEFINE_SCALED_OP_16(opname, scale_arg, T1, Tvec, scalar_op, vec_op, init, pack_store_op, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    SIMD_ONLY(int simd_width = VTraits<Tvec>::vlanes()>>1;) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            /* Partial last vector: step back so it ends exactly at `width`, */ \
            /* unless the row is shorter than one vector or dst aliases a    */ \
            /* source; in those cases leave the tail to the scalar loop.     */ \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_int32 i1 = v_reinterpret_as_s32(vx_load_expand(src1 + x)); \
            when_binary(v_int32 i2 = v_reinterpret_as_s32(vx_load_expand(src2 + x))); \
            v_float32 f1 = v_cvt_f32(i1); \
            when_binary(v_float32 f2 = v_cvt_f32(i2)); \
            f1 = vec_op(); \
            i1 = v_round(f1); \
            pack_store_op(dst + x, i1); \
        }) \
        for (; x < width; x++) { \
            float f1 = (float)src1[x]; \
            when_binary(float f2 = (float)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SCALED_OP_16F generates `void opname(src1, step1, src2, step2, dst, step,
// width, height, scale_arg)` for the 16-bit floating-point element types it is
// instantiated with (hfloat, bfloat), computing in float.
//
// Macro parameters:
//   opname      - name of the generated function
//   scale_arg   - declaration of the trailing parameter (e.g. `double scale`)
//   T1          - element type; byte steps are divided by sizeof(T1)
//   scalar_op   - macro producing the scalar result from locals f1 (and f2)
//   vec_op      - macro producing the v_float32 result from f1 (and f2)
//   init        - macro declaring the constants used by scalar_op/vec_op; it
//                 must supply its own statement terminators
//   when_binary - this_is_binary / this_is_unary: keeps or drops the src2 loads
//
// The vector path loads with vx_load_expand into v_float32 and writes back with
// v_pack_store; one v_float32 worth of elements is handled per SIMD step.
#define DEFINE_SCALED_OP_16F(opname, scale_arg, T1, scalar_op, vec_op, init, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    SIMD_ONLY(int simd_width = VTraits<v_float32>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            /* Partial last vector: step back so it ends exactly at `width`, */ \
            /* unless the row is shorter than one vector or dst aliases a    */ \
            /* source; in those cases leave the tail to the scalar loop.     */ \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_float32 f1 = vx_load_expand(src1 + x); \
            when_binary(v_float32 f2 = vx_load_expand(src2 + x)); \
            f1 = vec_op(); \
            v_pack_store(dst + x, f1); \
        }) \
        for (; x < width; x++) { \
            float f1 = (float)src1[x]; \
            when_binary(float f2 = (float)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SCALED_OP_32 generates `void opname(src1, step1, src2, step2, dst, step,
// width, height, scale_arg)` for 32-bit element types, computing in float.
//
// Macro parameters:
//   opname      - name of the generated function
//   scale_arg   - declaration of the trailing parameter (e.g. `double scale`)
//   T1          - element type; byte steps are divided by sizeof(T1)
//   Tvec        - vector type whose lane count is the SIMD step
//   scalar_op   - macro producing the scalar result from locals f1 (and f2)
//   vec_op      - macro producing the v_float32 result from f1 (and f2)
//   init        - macro declaring the constants used by scalar_op/vec_op; it
//                 must supply its own statement terminators
//   load_op     - loads a v_float32 from a T1 pointer
//   store_op    - stores a v_float32 result to a T1 pointer
//   when_binary - this_is_binary / this_is_unary: keeps or drops the src2 loads
//
// SIMD_ONLY(...) wraps the vector path; the scalar loop handles the remainder.
#define DEFINE_SCALED_OP_32(opname, scale_arg, T1, Tvec, scalar_op, vec_op, init, load_op, store_op, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    SIMD_ONLY(int simd_width = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            /* Partial last vector: step back so it ends exactly at `width`, */ \
            /* unless the row is shorter than one vector or dst aliases a    */ \
            /* source; in those cases leave the tail to the scalar loop.     */ \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_float32 f1 = load_op(src1 + x); \
            when_binary(v_float32 f2 = load_op(src2 + x)); \
            f1 = vec_op(); \
            store_op(dst + x, f1); \
        }) \
        for (; x < width; x++) { \
            float f1 = (float)src1[x]; \
            when_binary(float f2 = (float)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SCALED_OP_64F_ generates `void opname(src1, step1, src2, step2, dst, step,
// width, height, scale_arg)` computing in double, with a v_float64 vector path.
//
// Macro parameters:
//   opname      - name of the generated function
//   scale_arg   - declaration of the trailing parameter (e.g. `double scale`)
//   T1          - element type; byte steps are divided by sizeof(T1)
//   Tvec        - vector type whose lane count is the SIMD step
//   scalar_op   - macro producing the scalar result from locals f1 (and f2)
//   vec_op      - macro producing the v_float64 result from f1 (and f2)
//   init        - macro declaring the constants used by scalar_op/vec_op; it
//                 must supply its own statement terminators
//   when_binary - this_is_binary / this_is_unary: keeps or drops the src2 loads
//
// SIMD_ONLY(...) wraps the vector path; the scalar loop handles the remainder.
#define DEFINE_SCALED_OP_64F_(opname, scale_arg, T1, Tvec, scalar_op, vec_op, init, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    SIMD_ONLY(int simd_width = VTraits<Tvec>::vlanes();) \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        SIMD_ONLY(for (; x < width; x += simd_width) \
        { \
            /* Partial last vector: step back so it ends exactly at `width`, */ \
            /* unless the row is shorter than one vector or dst aliases a    */ \
            /* source; in those cases leave the tail to the scalar loop.     */ \
            if (x + simd_width > width) { \
                if (((x == 0) | (dst == src1) | (dst == src2)) != 0) \
                    break; \
                x = width - simd_width; \
            } \
            v_float64 f1 = vx_load(src1 + x); \
            when_binary(v_float64 f2 = vx_load(src2 + x)); \
            f1 = vec_op(); \
            v_store(dst + x, f1); \
        }) \
        for (; x < width; x++) { \
            double f1 = (double)src1[x]; \
            when_binary(double f2 = (double)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
    SIMD_ONLY(vx_cleanup();) \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// DEFINE_SCALED_OP_NOSIMD generates a purely scalar
// `void opname(src1, step1, src2, step2, dst, step, width, height, scale_arg)`.
//
// Macro parameters:
//   opname      - name of the generated function
//   scale_arg   - declaration of the trailing parameter (e.g. `double scale`)
//   T1          - element type; byte steps are divided by sizeof(T1)
//   worktype    - type the operands are converted to before scalar_op is applied
//   scalar_op   - macro producing the result from locals f1 (and f2)
//   init        - macro declaring the constants used by scalar_op; it must
//                 supply its own statement terminators
//   when_binary - this_is_binary / this_is_unary: keeps or drops the src2 load
//
// The result of scalar_op is written back through saturate_cast<T1>.
#define DEFINE_SCALED_OP_NOSIMD(opname, scale_arg, T1, worktype, scalar_op, init, when_binary) \
void opname(const T1* src1, size_t step1, const T1* src2, size_t step2, \
            T1* dst, size_t step, int width, int height, scale_arg) \
{ \
    CV_INSTRUMENT_REGION(); \
    init() \
    step1 /= sizeof(T1); \
    step2 /= sizeof(T1); \
    step /= sizeof(T1); \
    for (; --height >= 0; src1 += step1, src2 += step2, dst += step) { \
        int x = 0; \
        for (; x < width; x++) { \
            worktype f1 = (worktype)src1[x]; \
            when_binary(worktype f2 = (worktype)src2[x]); \
            dst[x] = saturate_cast<T1>(scalar_op()); \
        } \
    } \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Prologue for float-based mul/div/recip kernels: declares `sscale` (the
// `scale` argument narrowed to float) and, on the SIMD path, the vectors
// `vzero` and `vscale`.
// NOTE(review): vscale is built as setall(sscale) + vzero; presumably this keeps
// vzero referenced by operations that do not otherwise use it - confirm.
#undef init_muldiv_f32
#define init_muldiv_f32() \
    float sscale = (float)scale; \
    SIMD_ONLY(v_float32 vzero = vx_setzero_f32(); \
    v_float32 vscale = v_add(vx_setall_f32(sscale), vzero);)
|
|
|
|
// Prologue for float-based addWeighted kernels: reads weights[0], weights[1]
// and weights[2] into the float scalars `sw1`, `sw2`, `sdelta` and, on the SIMD
// path, broadcasts them into `vw1`, `vw2`, `vdelta`.
#undef init_addw_f32
#define init_addw_f32() \
    float sw1 = (float)weights[0]; \
    float sw2 = (float)weights[1]; \
    float sdelta = (float)weights[2];\
    SIMD_ONLY(v_float32 vw1 = vx_setall_f32(sw1); \
    v_float32 vw2 = vx_setall_f32(sw2); \
    v_float32 vdelta = vx_setall_f32(sdelta);)
|
|
|
|
|
|
|
|
// Scalar-only prologue for float-based mul/div/recip kernels: declares
// `sscale`, the `scale` argument narrowed to float.
#undef init_muldiv_nosimd_f32
#define init_muldiv_nosimd_f32() \
    float sscale = (float)scale;
|
|
|
|
// Scalar-only prologue for float-based addWeighted kernels: reads weights[0],
// weights[1] and weights[2] into the float scalars `sw1`, `sw2`, `sdelta`.
#undef init_addw_nosimd_f32
#define init_addw_nosimd_f32() \
    float sw1 = (float)weights[0]; \
    float sw2 = (float)weights[1]; \
    float sdelta = (float)weights[2];
|
|
|
|
|
|
|
|
// Scalar-only prologues for double-based kernels.
//   init_muldiv_nosimd_f64: declares `sscale` as a copy of `scale`.
//   init_addw_nosimd_f64:   declares `sw1`, `sw2`, `sdelta` from weights[0..2].
#undef init_muldiv_nosimd_f64
#undef init_addw_nosimd_f64
#define init_muldiv_nosimd_f64() \
    double sscale = scale;
#define init_addw_nosimd_f64() \
    double sw1 = weights[0]; \
    double sw2 = weights[1]; \
    double sdelta = weights[2];
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Select the implementation used for double (64f) element data.
//  * With 64-bit float SIMD available, DEFINE_SCALED_OP_64F forwards to the
//    vectorized DEFINE_SCALED_OP_64F_ generator and the init_*_f64 prologues
//    also declare the v_float64 constants.
//  * Otherwise it forwards to DEFINE_SCALED_OP_NOSIMD (vec_op is ignored) and
//    the init_*_f64 prologues are the scalar-only variants.
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SCALED_OP_64F(opname, scale_arg, scalar_op, vec_op, init, when_binary) \
    DEFINE_SCALED_OP_64F_(opname, scale_arg, double, v_float64, scalar_op, vec_op, init, when_binary)
#define init_muldiv_f64() \
    double sscale = (double)scale; \
    SIMD_ONLY(v_float64 vzero = vx_setzero_f64(); \
    v_float64 vscale = v_add(vx_setall_f64(sscale), vzero);)
#define init_addw_f64() \
    double sw1 = weights[0]; \
    double sw2 = weights[1]; \
    double sdelta = weights[2];\
    SIMD_ONLY(v_float64 vw1 = vx_setall_f64(sw1); \
    v_float64 vw2 = vx_setall_f64(sw2); \
    v_float64 vdelta = vx_setall_f64(sdelta);)
#else
#define DEFINE_SCALED_OP_64F(opname, scale_arg, scalar_op, vec_op, init, when_binary) \
    DEFINE_SCALED_OP_NOSIMD(opname, scale_arg, double, double, scalar_op, init, when_binary)
#define init_muldiv_f64() init_muldiv_nosimd_f64()
#define init_addw_f64() init_addw_nosimd_f64()
#endif
|
2018-11-11 04:10:57 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Per-element expressions plugged into the DEFINE_SCALED_OP_* generators.
// They read the generator's locals: f1/f2 (operands), sscale/vscale (scale),
// vzero, and sw1/sw2/sdelta or vw1/vw2/vdelta (addWeighted coefficients).
//
//   scalar_* / vec_*   - used for both integer and floating-point data
//   iscalar_* / ivec_* - integer variants: yield 0 when the divisor is 0
//   fscalar_* / fvec_* - floating-point variants: plain division, no zero check
#undef scalar_mul
#undef vec_mul
#undef iscalar_div
#undef ivec_div
#undef fscalar_div
#undef fvec_div
#undef iscalar_recip
#undef ivec_recip
#undef fscalar_recip
#undef fvec_recip
#undef scalar_addw
#undef vec_addw
#define scalar_mul() ((f1)*(f2)*sscale)
#define vec_mul() v_mul(v_mul((f1), vscale), (f2))
#define iscalar_div() ((f2)!=0? (f1)*sscale/(f2) : 0)
#define ivec_div() v_select(v_eq((f2), vzero), vzero, v_div(v_mul((f1), vscale), (f2)))
#define fscalar_div() ((f1)*sscale/(f2))
#define fvec_div() v_div(v_mul((f1), vscale), (f2))
#define iscalar_recip() ((f1)!=0? sscale/(f1) : 0)
#define ivec_recip() v_select(v_eq((f1), vzero), vzero, v_div(vscale, (f1)))
#define fscalar_recip() (sscale/(f1))
#define fvec_recip() v_div(vscale, (f1))
#define scalar_addw() ((f1)*sw1 + (f2)*sw2 + sdelta)
#define vec_addw() v_fma((f1), vw1, v_fma((f2), vw2, vdelta))
|
|
|
|
// Load/store adapters for running a float kernel over integer data:
//   load_as_f32(addr)     - vx_load from addr, then convert with v_cvt_f32
//   store_as_s32(addr, x) - v_round x, then v_store the result to addr
#undef load_as_f32
#undef store_as_s32
#define load_as_f32(addr) v_cvt_f32(vx_load(addr))
#define store_as_s32(addr, x) v_store((addr), v_round(x))
|
|
|
|
|
|
|
|
// Arity selectors passed as `when_binary` to the DEFINE_SCALED_OP_* generators:
//   this_is_binary(expr) keeps expr (the second operand is loaded and used)
//   this_is_unary(expr)  expands to nothing (all code touching f2 is removed)
#undef this_is_binary
#undef this_is_unary
#define this_is_binary(expr) expr
#define this_is_unary(expr)
|
|
|
|
|
|
|
|
// DEFINE_SCALED_OP_ALLTYPES instantiates one scaled operation for every
// supported element type, producing opname##8u ... opname##16bf.
//
// Macro parameters:
//   opname                - base name; the type suffix is appended to it
//   scale_arg             - declaration of the trailing function parameter
//   iscalar_op, ivec_op   - scalar / vector expression used for integer types
//   fscalar_op, fvec_op   - scalar / vector expression used for float types
//   init                  - prologue base name; _f32, _f64 or _nosimd_f64 is
//                           appended according to the working precision
//   when_binary           - this_is_binary or this_is_unary
//
// 8/16-bit integers, 32f, 16f and 16bf compute in float; 64f uses
// DEFINE_SCALED_OP_64F; 32u/32s/64u/64s use the scalar-only generator with
// double as the working type.
#undef DEFINE_SCALED_OP_ALLTYPES
#define DEFINE_SCALED_OP_ALLTYPES(opname, scale_arg, iscalar_op, fscalar_op, ivec_op, fvec_op, init, when_binary) \
    DEFINE_SCALED_OP_8(opname##8u, scale_arg, uchar, v_uint8, iscalar_op, ivec_op, init##_f32, v_pack_u_store, when_binary) \
    DEFINE_SCALED_OP_8(opname##8s, scale_arg, schar, v_int8, iscalar_op, ivec_op, init##_f32, v_pack_store, when_binary) \
    DEFINE_SCALED_OP_16(opname##16u, scale_arg, ushort, v_uint16, iscalar_op, ivec_op, init##_f32, v_pack_u_store, when_binary) \
    DEFINE_SCALED_OP_16(opname##16s, scale_arg, short, v_int16, iscalar_op, ivec_op, init##_f32, v_pack_store, when_binary) \
    DEFINE_SCALED_OP_NOSIMD(opname##32u, scale_arg, unsigned, double, iscalar_op, init##_nosimd_f64, when_binary) \
    DEFINE_SCALED_OP_NOSIMD(opname##32s, scale_arg, int, double, iscalar_op, init##_nosimd_f64, when_binary) \
    DEFINE_SCALED_OP_NOSIMD(opname##64u, scale_arg, uint64, double, iscalar_op, init##_nosimd_f64, when_binary) \
    DEFINE_SCALED_OP_NOSIMD(opname##64s, scale_arg, int64, double, iscalar_op, init##_nosimd_f64, when_binary) \
    DEFINE_SCALED_OP_32(opname##32f, scale_arg, float, v_float32, fscalar_op, fvec_op, init##_f32, vx_load, v_store, when_binary) \
    DEFINE_SCALED_OP_64F(opname##64f, scale_arg, fscalar_op, fvec_op, init##_f64, when_binary) \
    DEFINE_SCALED_OP_16F(opname##16f, scale_arg, hfloat, fscalar_op, fvec_op, init##_f32, when_binary) \
    DEFINE_SCALED_OP_16F(opname##16bf, scale_arg, bfloat, fscalar_op, fvec_op, init##_f32, when_binary)
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
|
|
|
|
|
|
|
|
// Instantiate mul*, div*, addWeighted* and recip* for every element type.
// mul and addWeighted use the same expression for integer and float data;
// div and recip use the zero-checked i* expressions for integer data.
// recip is unary: only src1 is read by the generated loops.
DEFINE_SCALED_OP_ALLTYPES(mul, double scale, scalar_mul, scalar_mul, vec_mul, vec_mul, init_muldiv, this_is_binary)
DEFINE_SCALED_OP_ALLTYPES(div, double scale, iscalar_div, fscalar_div, ivec_div, fvec_div, init_muldiv, this_is_binary)
DEFINE_SCALED_OP_ALLTYPES(addWeighted, double weights[3], scalar_addw, scalar_addw, vec_addw, vec_addw, init_addw, this_is_binary)
DEFINE_SCALED_OP_ALLTYPES(recip, double scale, iscalar_recip, fscalar_recip, ivec_recip, fvec_recip, init_muldiv, this_is_unary)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
#endif
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
#ifdef ARITHM_DISPATCHING_ONLY
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Generator for the public entry point of a parameterless binary element-wise operation.
//   opname    - name of the function to define; it is also pasted into cv_hal_<opname>
//               and used as the kernel name handed to CV_CPU_DISPATCH.
//   decl_type - element type that appears in the generated signature.
//   type      - element type the pointers are cast to before the dispatched call.
// The trailing unnamed void* argument of the generated function is ignored.
// NOTE(review): CALL_HAL and CV_CPU_DISPATCH are defined outside this excerpt; presumably
// CALL_HAL returns early when a HAL implementation handled the call - confirm.
#undef DEFINE_BINARY_OP_DISPATCHER
#define DEFINE_BINARY_OP_DISPATCHER(opname, decl_type, type) \
void opname(const decl_type* src1, size_t step1, \
            const decl_type* src2, size_t step2, \
            decl_type* dst, size_t step, \
            int width, int height, void*) \
{ \
    CV_INSTRUMENT_REGION(); \
    CALL_HAL(opname, cv_hal_##opname, src1, step1, src2, step2, dst, step, width, height) \
    const type* typed_src1 = (const type*)src1; \
    const type* typed_src2 = (const type*)src2; \
    type* typed_dst = (type*)dst; \
    CV_CPU_DISPATCH(opname, \
                    (typed_src1, step1, typed_src2, step2, typed_dst, step, width, height), \
                    CV_CPU_DISPATCH_MODES_ALL); \
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Expands DEFINE_BINARY_OP_DISPATCHER once per element type, pasting the type suffix onto
// opname (8u, 8s, 16u, 16s, 32u, 32s, 64u, 64s, 16f, 16bf, 32f, 64f).
// Columns: generated function name, signature element type, kernel element type.
// Only the 16f and 16bf rows use a signature type (cv_hal_f16 / cv_hal_bf16) that differs
// from the kernel type (hfloat / bfloat).
#define DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(opname) \
    DEFINE_BINARY_OP_DISPATCHER(opname##8u,   uchar,       uchar)    \
    DEFINE_BINARY_OP_DISPATCHER(opname##8s,   schar,       schar)    \
    DEFINE_BINARY_OP_DISPATCHER(opname##16u,  ushort,      ushort)   \
    DEFINE_BINARY_OP_DISPATCHER(opname##16s,  short,       short)    \
    DEFINE_BINARY_OP_DISPATCHER(opname##32u,  unsigned,    unsigned) \
    DEFINE_BINARY_OP_DISPATCHER(opname##32s,  int,         int)      \
    DEFINE_BINARY_OP_DISPATCHER(opname##64u,  uint64,      uint64)   \
    DEFINE_BINARY_OP_DISPATCHER(opname##64s,  int64,       int64)    \
    DEFINE_BINARY_OP_DISPATCHER(opname##16f,  cv_hal_f16,  hfloat)   \
    DEFINE_BINARY_OP_DISPATCHER(opname##16bf, cv_hal_bf16, bfloat)   \
    DEFINE_BINARY_OP_DISPATCHER(opname##32f,  float,       float)    \
    DEFINE_BINARY_OP_DISPATCHER(opname##64f,  double,      double)
|
|
|
|
|
|
|
|
// Instantiate the parameterless binary dispatchers.
// add, sub, max, min and absdiff go through DEFINE_BINARY_OP_DISPATCHER_ALLTYPES and thus get
// one entry point per type suffix listed in that macro (add8u, add8s, ..., absdiff64f).
// The bitwise and/or/xor operations are instantiated directly and only for the 8u suffix,
// with uchar as both the signature and the kernel element type.
DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(add)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(sub)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(max)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(min)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER_ALLTYPES(absdiff)
|
|
|
|
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER(and8u, uchar, uchar)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER(or8u, uchar, uchar)
|
|
|
|
DEFINE_BINARY_OP_DISPATCHER(xor8u, uchar, uchar)
|
|
|
|
|
|
|
|
void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
|
|
|
|
uchar* dst, size_t step, int width, int height, void*)
|
2018-07-26 04:00:37 +08:00
|
|
|
{
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
CV_INSTRUMENT_REGION();
|
|
|
|
CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height)
|
|
|
|
CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL);
|
2018-07-26 04:00:37 +08:00
|
|
|
}
|
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Generator for the public entry point of a comparison operation.
//   opname    - name of the generated function and of the kernel given to CV_CPU_DISPATCH.
//   decl_type - element type of the two sources in the generated signature.
//   type      - element type the source pointers are cast to for the dispatched kernel.
// The destination is always uchar. `params` is dereferenced as an int and that value is
// passed as the last kernel argument (presumably the comparison code - verify against the
// callers); it must therefore point to a valid int.
// Unlike the other dispatcher generators in this file, no CALL_HAL step is emitted here.
#undef DEFINE_CMP_OP_DISPATCHER
#define DEFINE_CMP_OP_DISPATCHER(opname, decl_type, type) \
void opname(const decl_type* src1, size_t step1, \
            const decl_type* src2, size_t step2, \
            uchar* dst, size_t step, \
            int width, int height, void* params) \
{ \
    CV_INSTRUMENT_REGION(); \
    const type* typed_src1 = (const type*)src1; \
    const type* typed_src2 = (const type*)src2; \
    CV_CPU_DISPATCH(opname, \
                    (typed_src1, step1, typed_src2, step2, \
                     dst, step, width, height, *(int*)params), \
                    CV_CPU_DISPATCH_MODES_ALL); \
}
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Instantiate the comparison dispatchers for the integer suffixes (8u ... 64s) and for the
// 16f / 16bf suffixes. For the integer rows the signature type and the kernel type are the
// same; cmp16f and cmp16bf declare cv_hal_f16 / cv_hal_bf16 in the signature and cast to
// hfloat / bfloat for the kernel.
DEFINE_CMP_OP_DISPATCHER(cmp8u, uchar, uchar)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp8s, schar, schar)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp16u, ushort, ushort)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp16s, short, short)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp32u, unsigned, unsigned)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp32s, int, int)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp64u, uint64, uint64)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp64s, int64, int64)
|
2024-04-01 22:09:24 +08:00
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp16f, cv_hal_f16, hfloat)
|
2024-03-22 08:45:59 +08:00
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp16bf, cv_hal_bf16, bfloat)
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Instantiate the comparison dispatchers for the 32f and 64f suffixes; signature and kernel
// element types are identical (float / double).
DEFINE_CMP_OP_DISPATCHER(cmp32f, float, float)
|
|
|
|
DEFINE_CMP_OP_DISPATCHER(cmp64f, double, double)
|
|
|
|
|
|
|
|
// Generator for the public entry point of a binary operation that takes extra parameters.
//   opname      - name of the generated function; also pasted into cv_hal_<opname> and used
//                 as the kernel name handed to CV_CPU_DISPATCH.
//   decl_type   - element type that appears in the generated signature.
//   type        - element type the pointers are cast to for the dispatched kernel.
//   read_params - statement(s) pasted verbatim at the top of the body; they are expected to
//                 decode the opaque `params_` pointer into a local variable.
//   paramname   - name of that local; it is appended to both the HAL and the kernel call.
// The opaque parameter must stay named `params_`, because read_params text supplied by the
// instantiations refers to it by that name.
// NOTE(review): CALL_HAL is defined outside this excerpt; presumably it returns early when
// the HAL handled the call - confirm.
#undef DEFINE_BINARY_OP_W_PARAMS_DISPATCHER
#define DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname, decl_type, type, read_params, paramname) \
void opname(const decl_type* src1, size_t step1, \
            const decl_type* src2, size_t step2, \
            decl_type* dst, size_t step, \
            int width, int height, void* params_) \
{ \
    CV_INSTRUMENT_REGION(); \
    read_params; \
    CALL_HAL(opname, cv_hal_##opname, src1, step1, src2, step2, dst, step, width, height, paramname) \
    const type* typed_src1 = (const type*)src1; \
    const type* typed_src2 = (const type*)src2; \
    type* typed_dst = (type*)dst; \
    CV_CPU_DISPATCH(opname, \
                    (typed_src1, step1, typed_src2, step2, \
                     typed_dst, step, width, height, paramname), \
                    CV_CPU_DISPATCH_MODES_ALL); \
}
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Expands DEFINE_BINARY_OP_W_PARAMS_DISPATCHER once per element type, pasting the type
// suffix onto opname and forwarding read_params / paramname unchanged to every expansion.
// Columns: generated function name, signature element type, kernel element type.
// Only the 16f and 16bf rows use a signature type (cv_hal_f16 / cv_hal_bf16) that differs
// from the kernel type (hfloat / bfloat).
#undef DEFINE_BINARY_OP_W_PARAMS_DISPATCHER_ALLTYPES
#define DEFINE_BINARY_OP_W_PARAMS_DISPATCHER_ALLTYPES(opname, read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##8u,   uchar,       uchar,    read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##8s,   schar,       schar,    read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##16u,  ushort,      ushort,   read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##16s,  short,       short,    read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##32u,  unsigned,    unsigned, read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##32s,  int,         int,      read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##64u,  uint64,      uint64,   read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##64s,  int64,       int64,    read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##16f,  cv_hal_f16,  hfloat,   read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##16bf, cv_hal_bf16, bfloat,   read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##32f,  float,       float,    read_params, paramname) \
    DEFINE_BINARY_OP_W_PARAMS_DISPATCHER(opname##64f,  double,      double,   read_params, paramname)
|
|
|
|
|
|
|
|
// Instantiate the parameterised binary dispatchers for every type suffix.
//  - mul and div read one double from params_ into a local `scale` and pass it on.
//  - addWeighted copies three consecutive doubles from params_ into a local array `w` and
//    passes `w`. Its read_params text already ends with ';', and the generator appends
//    another ';' after it, which leaves a harmless empty statement in the expansion.
// The read_params text must not contain a top-level comma, since the preprocessor would
// split it into separate macro arguments; this is why `w` is filled element by element
// rather than with a braced initializer.
DEFINE_BINARY_OP_W_PARAMS_DISPATCHER_ALLTYPES(mul, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_BINARY_OP_W_PARAMS_DISPATCHER_ALLTYPES(div, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_BINARY_OP_W_PARAMS_DISPATCHER_ALLTYPES(addWeighted, \
|
|
|
|
double w[3]; \
|
|
|
|
w[0]=((double*)params_)[0]; \
|
|
|
|
w[1]=((double*)params_)[1]; \
|
|
|
|
w[2]=((double*)params_)[2];, \
|
|
|
|
w)
|
|
|
|
|
|
|
|
// Generator for the public entry point of a unary operation that takes extra parameters.
// The generated signature keeps the two-source layout of the binary dispatchers, but the
// second source pointer and its step are unnamed and ignored: the HAL hook receives only
// src1/step1 and dst/step, and the dispatched kernel is called with nullptr / 0 in the
// second-source position.
//   opname      - name of the generated function; also pasted into cv_hal_<opname> and used
//                 as the kernel name handed to CV_CPU_DISPATCH.
//   decl_type   - element type that appears in the generated signature.
//   type        - element type the pointers are cast to for the dispatched kernel.
//   read_params - statement(s) pasted verbatim at the top of the body; they are expected to
//                 decode the opaque `params_` pointer into a local variable.
//   paramname   - name of that local; it is appended to both the HAL and the kernel call.
// NOTE(review): CALL_HAL is defined outside this excerpt; presumably it returns early when
// the HAL handled the call - confirm.
#undef DEFINE_UNARY_OP_W_PARAMS_DISPATCHER
#define DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(opname, decl_type, type, read_params, paramname) \
void opname(const decl_type* src1, size_t step1, \
            const decl_type*, size_t, \
            decl_type* dst, size_t step, \
            int width, int height, void* params_) \
{ \
    CV_INSTRUMENT_REGION(); \
    read_params; \
    CALL_HAL(opname, cv_hal_##opname, src1, step1, dst, step, width, height, paramname) \
    const type* typed_src1 = (const type*)src1; \
    type* typed_dst = (type*)dst; \
    CV_CPU_DISPATCH(opname, \
                    (typed_src1, step1, nullptr, 0, \
                     typed_dst, step, width, height, paramname), \
                    CV_CPU_DISPATCH_MODES_ALL); \
}
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Instantiate the recip dispatchers for the integer suffixes (8u ... 64s) and for the
// 16f / 16bf suffixes. Every instance reads one double from params_ into a local `scale`
// and passes it on. recip16f and recip16bf declare cv_hal_f16 / cv_hal_bf16 in the signature
// and cast to hfloat / bfloat for the kernel; the other rows use one type for both.
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip8u, uchar, uchar, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip8s, schar, schar, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip16u, ushort, ushort, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip16s, short, short, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip32u, unsigned, unsigned, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip32s, int, int, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip64u, uint64, uint64, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip64s, int64, int64, double scale = *(double*)params_, scale)
|
2024-04-01 22:09:24 +08:00
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip16f, cv_hal_f16, hfloat, double scale = *(double*)params_, scale)
|
2024-03-22 08:45:59 +08:00
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip16bf, cv_hal_bf16, bfloat, double scale = *(double*)params_, scale)
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
// Instantiate the recip dispatchers for the 32f and 64f suffixes; both read one double from
// params_ into a local `scale`, and signature and kernel element types are identical.
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip32f, float, float, double scale = *(double*)params_, scale)
|
|
|
|
DEFINE_UNARY_OP_W_PARAMS_DISPATCHER(recip64f, double, double, double scale = *(double*)params_, scale)
|
2018-07-26 04:00:37 +08:00
|
|
|
|
Extended several core functions to support new types (#24962)
* started adding support for new types (16f, 16bf, 32u, 64u, 64s) to arithmetic functions
* fixed several tests; refactored and extended sum(), extended inRange().
* extended countNonZero(), mean(), meanStdDev(), minMaxIdx(), norm() and sum() to support new types (F16, BF16, U32, U64, S64)
* put missing CV_DEPTH_MAX to some function dispatcher tables
* extended findnonzero, hasnonzero with the new types support
* extended mixChannels() to support new types
* minor fix
* fixed a few compile errors on Linux and a few failures in core tests
* fixed a few more warnings and test failures
* trying to fix the remaining warnings and test failures. The test `MulTestGPU.MathOpTest` was disabled - not clear whether to set tolerance - it's not bit-exact operation, as possibly assumed by the test, due to the use of scale and possibly limited accuracy of the intermediate floating-point calculations.
* found that in the current snapshot G-API produces incorrect results in Mul, Div and AddWeighted (at least when using OpenCL on Windows x64 or MacOS x64). Disabled the respective tests.
2024-02-11 15:42:41 +08:00
|
|
|
#endif
|
2018-07-26 04:00:37 +08:00
|
|
|
|
|
|
|
#ifndef ARITHM_DISPATCHING_ONLY
|
|
|
|
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
|
|
|
#endif
|
|
|
|
|
2021-05-26 01:15:12 +08:00
|
|
|
}} // cv::hal::
|