Merge branch 4.x

Alexander Smorkalov 2023-10-16 21:25:56 +03:00
commit 97620c053f
221 changed files with 3584 additions and 2215 deletions

View File

@ -16,6 +16,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS
-Wunused-but-set-variable # clang15
-Wmissing-prototypes # clang, function opj_t1_ht_decode_cblk
-Wmissing-declarations # gcc, function opj_t1_ht_decode_cblk
-Wdocumentation # clang
)
#-----------------------------------------------------------------------------

View File

@ -27,6 +27,8 @@ else()
-Wimplicit-fallthrough
-Warray-bounds # GCC 9+
-Wstringop-overflow -Wstringop-overread # GCC 11-12
-Wextra-semi # clang
-Wcomma # clang
)
endif()
if(CV_ICC)

View File

@ -209,7 +209,7 @@ if(NOT ${found})
message(STATUS " PYTHON3_NUMPY_INCLUDE_DIRS")
else()
# Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
execute_process(COMMAND "${_executable}" -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))"
execute_process(COMMAND "${_executable}" -c "import numpy; print(numpy.get_include())"
RESULT_VARIABLE _numpy_process
OUTPUT_VARIABLE _numpy_include_dirs
OUTPUT_STRIP_TRAILING_WHITESPACE)

View File

@ -186,6 +186,8 @@ class PatternMaker:
yspacing = (self.height - self.rows * self.square_size) / 2.0
ch_ar_border = (self.square_size - self.aruco_marker_size)/2
if ch_ar_border < side*0.7:
print("Marker border {} is less than 70% of ArUco pin size {}. Please increase --square_size or decrease --marker_size for stable board detection".format(ch_ar_border, int(side)))
marker_id = 0
for y in range(0, self.rows):
for x in range(0, self.cols):
@ -283,6 +285,9 @@ def main():
else:
raise ValueError("The marker {},{} is outside the checkerboard".format(x, y))
if p_type == "charuco_board" and aruco_marker_size >= square_size:
raise ValueError("ArUco markers size must be smaller than square size")
pm = PatternMaker(columns, rows, output, units, square_size, radius_rate, page_width, page_height, markers, aruco_marker_size, dict_file)
# dict for easy lookup of pattern type
mp = {"circles": pm.make_circles_pattern, "acircles": pm.make_acircles_pattern,

View File

@ -112,7 +112,7 @@ public:
* 2 columns 1 channel
* @param _m2 destination points containing (x,y), depth is CV_32F with 1 column 2 channels or
* 2 columns 1 channel
* @param _model, CV_64FC1, 3x3, normalized, i.e., the last element is 1
* @param _model CV_64FC1, 3x3, normalized, i.e., the last element is 1
*/
int runKernel( InputArray _m1, InputArray _m2, OutputArray _model ) const CV_OVERRIDE
{
@ -187,7 +187,7 @@ public:
* @param _m1 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
* @param _m2 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
* @param _model CV_64FC1, 3x3
* @param _err, output, CV_32FC1, square of the L2 norm
* @param _err output, CV_32FC1, square of the L2 norm
*/
void computeError( InputArray _m1, InputArray _m2, InputArray _model, OutputArray _err ) const CV_OVERRIDE
{

View File

@ -111,7 +111,7 @@ private:
/**
* @brief Computes the translation solution for a given rotation solution
* @param objectPoints Array of corresponding object points, 1xN/Nx1 3-channel where N is the number of points
* @param normalizedImagePoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
* @param normalizedImgPoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
* @param R Rotation solution (3x1 rotation vector)
* @param t Translation solution (3x1 rotation vector)
*/
@ -220,10 +220,10 @@ private:
/**
* @brief Computes the average depth of an object given its pose in camera coordinates
* @param objectPoints: Object points defined in 3D object space
* @param rvec: Rotation component of pose
* @param tvec: Translation component of pose
* @return: average depth of the object
* @param objectPoints Object points defined in 3D object space
* @param rvec Rotation component of pose
* @param tvec Translation component of pose
* @return average depth of the object
*/
double meanSceneDepth(InputArray objectPoints, InputArray rvec, InputArray tvec);

View File

@ -220,8 +220,8 @@ int p3p::solve(double R[4][3][3], double t[4][3],
/// Only the solution to the main branch.
/// Reference : X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
/// IEEE Trans. on PAMI, vol. 25, No. 8, August 2003
/// \param lengths3D Lengths of line segments up to four solutions.
/// \param dist3D Distance between 3D points in pairs |BC|, |AC|, |AB|.
/// \param lengths Lengths of line segments up to four solutions.
/// \param distances Distance between 3D points in pairs |BC|, |AC|, |AB|.
/// \param cosines Cosine of the angles /_BPC, /_APC, /_APB.
/// \returns Number of solutions.
/// WARNING: NOT ALL THE DEGENERATE CASES ARE IMPLEMENTED

View File

@ -89,7 +89,7 @@ namespace cv {
* @param ep outlier ratio
* @param modelPoints number of model points required for estimation
* @param maxIters maximum number of iterations
* @return
* @return The number of iterations according to the formula
* \f[
* \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
* \f]
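For reference, the expression documented above is the standard RANSAC bound on the number of iterations needed to reach confidence p given outlier ratio ep and a model estimated from modelPoints samples. The stand-alone sketch below shows how it is typically evaluated; it is illustrative only (not code from this diff), and the guards against degenerate logarithms plus the clamp to maxIters are assumptions that mirror common practice:

#include <algorithm>
#include <cfloat>
#include <cmath>

// Hypothetical helper evaluating niters = ln(1 - p) / ln(1 - (1 - ep)^modelPoints),
// clamped to maxIters.
static int updateNumIters(double p, double ep, int modelPoints, int maxIters)
{
    p  = std::min(std::max(p,  0.0), 1.0);   // desired confidence
    ep = std::min(std::max(ep, 0.0), 1.0);   // outlier ratio
    double num   = std::max(1.0 - p, DBL_MIN);                    // keep the log argument positive
    double denom = 1.0 - std::pow(1.0 - ep, (double)modelPoints);
    if (denom < DBL_MIN)
        return 0;                  // every sample is an inlier, nothing to iterate
    num   = std::log(num);         // <= 0
    denom = std::log(denom);       // <= 0
    // clamp before dividing to avoid overflow when denom is (almost) zero
    return (denom >= 0 || -num >= maxIters * (-denom))
               ? maxIters
               : (int)std::lround(num / denom);
}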

View File

@ -486,7 +486,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed){
* Estimates the homography using the given context, matches and parameters to
* PROSAC.
*
* @param [in/out] p The context to use for homography estimation. Must
* @param [in,out] p The context to use for homography estimation. Must
* be already initialized. Cannot be NULL.
* @param [in] src The pointer to the source points of the matches.
* Must be aligned to 4 bytes. Cannot be NULL.

View File

@ -206,7 +206,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed);
* homography with at least the minimum required support, and 0 if it was not.
*
*
* @param [in/out] p The context to use for homography estimation. Must
* @param [in,out] p The context to use for homography estimation. Must
* be already initialized. Cannot be NULL.
* @param [in] src The pointer to the source points of the matches.
* Must be aligned to 4 bytes. Cannot be NULL.

View File

@ -89,8 +89,8 @@ public:
s2(_s2),
s3(_s3),
s4(_s4) {
#if CV_SIMD_64F
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
{
s_x[i] = ir[0] * i;
s_y[i] = ir[3] * i;
@ -123,26 +123,26 @@ public:
else
CV_Assert(m1 != NULL);
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const v_float64 v_one = vx_setall_f64(1.0);
for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
{
v_float64 m_0, m_1, m_2, m_3;
m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);
v_float64 xd_0 = x_0 * x_0;
v_float64 yd_0 = y_0 * y_0;
v_float64 xd_1 = x_1 * x_1;
v_float64 yd_1 = y_1 * y_1;
v_float64 xd_0 = v_mul(x_0, x_0);
v_float64 yd_0 = v_mul(y_0, y_0);
v_float64 xd_1 = v_mul(x_1, x_1);
v_float64 yd_1 = v_mul(y_1, y_1);
v_float64 r2_0 = xd_0 + yd_0;
v_float64 r2_1 = xd_1 + yd_1;
v_float64 r2_0 = v_add(xd_0, yd_0);
v_float64 r2_1 = v_add(xd_1, yd_1);
m_1 = vx_setall_f64(k3);
m_2 = vx_setall_f64(k2);
@ -151,18 +151,18 @@ public:
m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
m_3 = vx_setall_f64(k6);
m_2 = vx_setall_f64(k5);
m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));
m_3 = vx_setall_f64(2.0);
xd_0 = v_muladd(m_3, xd_0, r2_0);
yd_0 = v_muladd(m_3, yd_0, r2_0);
xd_1 = v_muladd(m_3, xd_1, r2_1);
yd_1 = v_muladd(m_3, yd_1, r2_1);
m_2 = x_0 * y_0 * m_3;
m_3 = x_1 * y_1 * m_3;
m_2 = v_mul(v_mul(x_0, y_0), m_3);
m_3 = v_mul(v_mul(x_1, y_1), m_3);
x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);
m_0 = vx_setall_f64(p1);
m_1 = vx_setall_f64(p2);
@ -176,8 +176,8 @@ public:
xd_1 = v_muladd(m_0, m_3, xd_1);
yd_1 = v_muladd(m_1, m_3, yd_1);
m_0 = r2_0 * r2_0;
m_1 = r2_1 * r2_1;
m_0 = v_mul(r2_0, r2_0);
m_1 = v_mul(r2_1, r2_1);
m_2 = vx_setall_f64(s2);
m_3 = vx_setall_f64(s1);
xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@ -203,17 +203,17 @@ public:
r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
m_0 = vx_setzero_f64();
r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));
m_0 = vx_setall_f64(fx);
m_1 = vx_setall_f64(u0);
m_2 = vx_setall_f64(fy);
m_3 = vx_setall_f64(v0);
x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);
if (m1type == CV_32FC1)
{
@ -225,20 +225,20 @@ public:
v_float32 mf0, mf1;
v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
v_store(&m1f[j * 2], mf0);
v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
}
else // m1type == CV_16SC2
{
m_0 = vx_setall_f64(INTER_TAB_SIZE);
x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);
v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
v_int32 iu = v_round(x_0, x_1);
v_int32 iv = v_round(y_0, y_1);
v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
v_int32 out0, out1;
v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
v_store(&m1[j * 2], v_pack(out0, out1));
}
}
@ -302,10 +302,10 @@ private:
double s2;
double s3;
double s4;
#if CV_SIMD_64F
double s_x[2*v_float64::nlanes];
double s_y[2*v_float64::nlanes];
double s_w[2*v_float64::nlanes];
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
double s_x[2*VTraits<v_float64>::max_nlanes];
double s_y[2*VTraits<v_float64>::max_nlanes];
double s_w[2*VTraits<v_float64>::max_nlanes];
#endif
};
}
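The changes in this file follow the pattern applied throughout this merge: compile-time lane counts (v_float64::nlanes) become VTraits<v_float64>::vlanes() or max_nlanes, and the overloaded operators (+, *, /, ==) become the function-style intrinsics (v_add, v_mul, v_div, v_eq), so the same code also builds against scalable SIMD backends such as RVV where vector types are sizeless. A minimal stand-alone sketch of the pattern (illustrative, not code from this diff; the function name is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

void scale_add(const float* a, const float* b, float* dst, int n, float alpha)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_float32>::vlanes();   // runtime lane count on scalable targets
    const v_float32 valpha = vx_setall_f32(alpha);
    for (; i <= n - vlanes; i += vlanes)
    {
        v_float32 va = vx_load(a + i);
        v_float32 vb = vx_load(b + i);
        // v_add/v_mul replace the overloaded +,* operators, which are not
        // available for sizeless (scalable) vector types
        v_store(dst + i, v_add(v_mul(va, valpha), vb));
    }
    vx_cleanup();
#endif
    for (; i < n; ++i)             // scalar tail
        dst[i] = a[i] * alpha + b[i];
}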

View File

@ -203,12 +203,12 @@ class Chessboard: public cv::Feature2D
* d12/d34 = d13/d24
*
* point order on the line:
* pt1 --> pt2 --> pt3 --> pt4
* p0 --> p1 --> p2 --> p3
*
* \param[in] pt1 First point coordinate
* \param[in] pt2 Second point coordinate
* \param[in] pt3 Third point coordinate
* \param[out] pt4 Forth point coordinate
* \param[in] p0 First point coordinate
* \param[in] p1 Second point coordinate
* \param[in] p2 Third point coordinate
* \param[out] p3 Forth point coordinate
*
*/
static bool estimatePoint(const cv::Point2f &p0,const cv::Point2f &p1,const cv::Point2f &p2,cv::Point2f &p3);
@ -309,7 +309,7 @@ class Chessboard: public cv::Feature2D
* \brief Draws the corners into the given image
*
* \param[in] m The image
* \param[out] m The resulting image
* \param[out] out The resulting image
* \param[in] H optional homography to calculate search area
*
*/
@ -668,7 +668,7 @@ class Chessboard: public cv::Feature2D
* \brief Calculates the average edge sharpness for the chessboard
*
* \param[in] image The image where the chessboard was detected
* \param[in] rise_distante Rise distance 0.8 means 10% ... 90%
* \param[in] rise_distance Rise distance 0.8 means 10% ... 90%
* \param[in] vertical by default only edge response for horiontal lines are calculated
*
* \returns Scalar(sharpness, average min_val, average max_val)

View File

@ -66,7 +66,7 @@ namespace cv {
* @param ep outlier ratio
* @param modelPoints number of model points required for estimation
* @param maxIters maximum number of iterations
* @return
* @return The number of iterations according to the formula
* \f[
* \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
* \f]

View File

@ -36,15 +36,15 @@
namespace cv {
template <typename T>
DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){};
DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}
template <typename T>
DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){};
w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}
template <typename T>
DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){};
w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}
template <typename T>
DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)

View File

@ -987,6 +987,15 @@ namespace CV__SIMD_NAMESPACE {
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@ -999,11 +1008,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
@ -1012,9 +1021,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
@ -1024,9 +1035,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
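OPENCV_HAL_WRAP_EQ_OP supplies only v_eq/v_ne for the 64-bit integer vector types, for which the full comparison set wrapped by OPENCV_HAL_WRAP_CMP is not provided. A small usage sketch under those assumptions (illustrative, not code from this diff; the function name is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

void mark_equal_u64(const cv::uint64* a, const cv::uint64* b, cv::uint64* out, int n)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_uint64>::vlanes();
    const v_uint64 vone = vx_setall_u64(1);
    for (; i <= n - vlanes; i += vlanes)
    {
        // v_eq comes from OPENCV_HAL_WRAP_EQ_OP above; ordered comparisons
        // (less/greater) are deliberately not wrapped for 64-bit lanes.
        v_uint64 mask = v_eq(vx_load(a + i), vx_load(b + i)); // all-ones lanes where equal
        v_store(out + i, v_and(mask, vone));                  // 1 where equal, 0 elsewhere
    }
    vx_cleanup();
#endif
    for (; i < n; ++i)
        out[i] = (a[i] == b[i]) ? 1 : 0;
}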

View File

@ -188,4 +188,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
} // cv::

View File

@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// 0.11 -> 0.12 compatibility
#ifndef _RVV_IMPLICIT_VXRM
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
#endif
// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg)
// masked
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
// unmasked
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)

View File

@ -21,6 +21,10 @@
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
#include "intrin_rvv_011_compat.hpp"
#endif
#if defined(__GNUC__) && !defined(__clang__)
// FIXIT: eliminate massive warnigs from templates
// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)

View File

@ -225,7 +225,7 @@ public:
void copyTo(const _OutputArray& dst) const;
void convertTo(const _OutputArray& dst, int type, double scale=1., double shift=0.) const;
_Tp val[m*n]; //< matrix elements
_Tp val[m*n]; ///< matrix elements
};
typedef Matx<float, 1, 2> Matx12f;

View File

@ -774,7 +774,7 @@ public:
void start();
void stop();
uint64 durationNS() const; //< duration in nanoseconds
uint64 durationNS() const; ///< duration in nanoseconds
protected:
struct Impl;

View File

@ -89,7 +89,7 @@ public:
//! conjugation
Complex conj() const;
_Tp re, im; //< the real and the imaginary parts
_Tp re, im; ///< the real and the imaginary parts
};
typedef Complex<float> Complexf;
@ -2028,8 +2028,8 @@ double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) {
/** @brief Finds out if there is any intersection between two rectangles
*
* mainly useful for language bindings
* @param rect1 First rectangle
* @param rect2 Second rectangle
* @param a First rectangle
* @param b Second rectangle
* @return the area of the intersection
*/
CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); }
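Since rectangleIntersectionArea is aimed at language bindings, a quick C++ usage sketch of the documented behaviour (illustrative only):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Rect2d a(0, 0, 10, 10), b(5, 5, 10, 10);
    // (a & b) is the 5x5 overlap, so the reported intersection area is 25.
    std::cout << cv::rectangleIntersectionArea(a, b) << std::endl;
    return 0;
}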

View File

@ -47,11 +47,11 @@ public:
explicit FileLock(const char* fname);
~FileLock();
void lock(); //< acquire exclusive (writer) lock
void unlock(); //< release exclusive (writer) lock
void lock(); ///< acquire exclusive (writer) lock
void unlock(); ///< release exclusive (writer) lock
void lock_shared(); //< acquire shareable (reader) lock
void unlock_shared(); //< release shareable (reader) lock
void lock_shared(); ///< acquire shareable (reader) lock
void unlock_shared(); ///< release shareable (reader) lock
struct Impl;
protected:

View File

@ -70,11 +70,11 @@ public:
struct LocationExtraData;
struct LocationStaticStorage
{
LocationExtraData** ppExtra; //< implementation specific data
const char* name; //< region name (function name or other custom name)
const char* filename; //< source code filename
int line; //< source code line
int flags; //< flags (implementation code path: Plain, IPP, OpenCL)
LocationExtraData** ppExtra; ///< implementation specific data
const char* name; ///< region name (function name or other custom name)
const char* filename; ///< source code filename
int line; ///< source code line
int flags; ///< flags (implementation code path: Plain, IPP, OpenCL)
};
Region(const LocationStaticStorage& location);
@ -100,18 +100,18 @@ private:
//! Specify region flags
enum RegionLocationFlag {
REGION_FLAG_FUNCTION = (1 << 0), //< region is function (=1) / nested named region (=0)
REGION_FLAG_APP_CODE = (1 << 1), //< region is Application code (=1) / OpenCV library code (=0)
REGION_FLAG_SKIP_NESTED = (1 << 2), //< avoid processing of nested regions
REGION_FLAG_FUNCTION = (1 << 0), ///< region is function (=1) / nested named region (=0)
REGION_FLAG_APP_CODE = (1 << 1), ///< region is Application code (=1) / OpenCV library code (=0)
REGION_FLAG_SKIP_NESTED = (1 << 2), ///< avoid processing of nested regions
REGION_FLAG_IMPL_IPP = (1 << 16), //< region is part of IPP code path
REGION_FLAG_IMPL_OPENCL = (2 << 16), //< region is part of OpenCL code path
REGION_FLAG_IMPL_OPENVX = (3 << 16), //< region is part of OpenVX code path
REGION_FLAG_IMPL_IPP = (1 << 16), ///< region is part of IPP code path
REGION_FLAG_IMPL_OPENCL = (2 << 16), ///< region is part of OpenCL code path
REGION_FLAG_IMPL_OPENVX = (3 << 16), ///< region is part of OpenVX code path
REGION_FLAG_IMPL_MASK = (15 << 16),
REGION_FLAG_REGION_FORCE = (1 << 30),
REGION_FLAG_REGION_NEXT = (1 << 31), //< close previous region (see #CV_TRACE_REGION_NEXT macro)
REGION_FLAG_REGION_NEXT = (1 << 31), ///< close previous region (see #CV_TRACE_REGION_NEXT macro)
ENUM_REGION_FLAG_FORCE_INT = INT_MAX
};

View File

@ -962,9 +962,9 @@ public class CoreTest extends OpenCVTestCase {
assertEquals(0.0, d);
d = Core.Mahalanobis(line1, line2, covar);
assertTrue(d > 0.0);
// Bug: https://github.com/opencv/opencv/issues/24348
// d = Core.Mahalanobis(line1, line2, covar);
// assertTrue(d > 0.0);
}
public void testMax() {

View File

@ -2,7 +2,7 @@
#include "opencv2/core/async.hpp"
CV_PY_TO_CLASS(AsyncArray);
CV_PY_FROM_CLASS(AsyncArray);
CV_PY_TO_CLASS(AsyncArray)
CV_PY_FROM_CLASS(AsyncArray)
#endif

View File

@ -20,18 +20,18 @@ template<> struct pyopencvVecConverter<cuda::GpuMat>
}
};
CV_PY_TO_CLASS(cuda::GpuMat);
CV_PY_TO_CLASS(cuda::Stream);
CV_PY_TO_CLASS(cuda::Event);
CV_PY_TO_CLASS(cuda::HostMem);
CV_PY_TO_CLASS(cuda::GpuMat)
CV_PY_TO_CLASS(cuda::Stream)
CV_PY_TO_CLASS(cuda::Event)
CV_PY_TO_CLASS(cuda::HostMem)
CV_PY_TO_CLASS_PTR(cuda::GpuMat);
CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator);
CV_PY_TO_CLASS_PTR(cuda::GpuMat)
CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator)
CV_PY_FROM_CLASS(cuda::GpuMat);
CV_PY_FROM_CLASS(cuda::Stream);
CV_PY_FROM_CLASS(cuda::HostMem);
CV_PY_FROM_CLASS(cuda::GpuMat)
CV_PY_FROM_CLASS(cuda::Stream)
CV_PY_FROM_CLASS(cuda::HostMem)
CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator);
CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator)
#endif

View File

@ -4,8 +4,8 @@
typedef std::vector<Range> vector_Range;
CV_PY_TO_CLASS(UMat);
CV_PY_FROM_CLASS(UMat);
CV_PY_TO_CLASS(UMat)
CV_PY_FROM_CLASS(UMat)
static bool cv_mappable_to(const Ptr<Mat>& src, Ptr<UMat>& dst)
{

View File

@ -45,4 +45,4 @@ PERF_TEST_P(MatDepth_tb, DISABLED_Allocation_Aligned,
SANITY_CHECK_NOTHING();
}
};
}

View File

@ -53,7 +53,6 @@
#undef CV__ALLOCATOR_STATS_LOG
//#define OPENCV_ALLOC_ENABLE_STATISTICS
#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes
#ifdef HAVE_POSIX_MEMALIGN
@ -63,6 +62,7 @@
#endif
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes
#include <map>
#endif

View File

@ -8,4 +8,4 @@
#include "arithm.simd_declarations.hpp"
#define ARITHM_DISPATCHING_ONLY
#include "arithm.simd.hpp"
#include "arithm.simd.hpp"

View File

@ -69,7 +69,7 @@
#define DEFINE_SIMD_F32(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SIMD_F64(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
#else
@ -262,7 +262,7 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
@ -272,7 +272,7 @@ struct op_absdiff<schar, v_int8>
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
@ -282,7 +282,7 @@ struct op_absdiff<short, v_int16>
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
@ -327,7 +327,7 @@ struct op_not
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
@ -392,7 +392,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef bin_loader<OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes();
#if !CV_NEON && CV_SIMD_WIDTH == 16
@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
@ -460,7 +460,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
@ -492,7 +492,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t
#define BIN_LOOP64F bin_loop_nosimd
#else
#define BIN_LOOP64F bin_loop
#endif //!CV_SIMD_64F
#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -617,7 +617,7 @@ struct op_cmpne
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct cmp_loader_n
@ -697,7 +697,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
#endif // CV_SIMD
@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, dst + x);
@ -764,7 +764,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
}
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template< template<typename T1, typename Tvec> class OP, typename T1>
static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
@ -818,7 +818,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
break;
}
}
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp)
//////////////////////////// Loaders ///////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
@ -1095,16 +1095,16 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
};
#endif // CV_SIMD
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
{
typedef OP<int, float, v_int32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_int32::nlanes};
static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);
v_int32 v_src1s = vx_load(src1 + step);
@ -1121,6 +1121,7 @@ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
}
static inline void l(const int* src1, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src1s = vx_load(src1 + step);
@ -1165,10 +1166,10 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
{
typedef OP<float, float, v_float32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_float32::nlanes};
static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src2 = vx_load(src2);
v_float32 v_src1s = vx_load(src1 + step);
@ -1182,6 +1183,7 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
}
static inline void l(const float* src1, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src1s = vx_load(src1 + step);
@ -1222,10 +1224,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
{
typedef OP<double, double, v_float64> op;
enum {step = v_float64::nlanes};
static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src2 = vx_load(src2);
v_float64 v_src1s = vx_load(src1 + step);
@ -1239,6 +1241,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
}
static inline void l(const double* src1, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src1s = vx_load(src1 + step);
@ -1249,7 +1252,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
v_store(dst + step, r1);
}
};
#endif // CV_SIMD_64F
#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
//////////////////////////// Loops /////////////////////////////////
@ -1259,7 +1262,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1273,7 +1276,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1305,7 +1308,7 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1318,7 +1321,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, scalar, dst + x);
@ -1345,7 +1348,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
// dual source
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
@ -1409,7 +1412,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste
#define SCALAR_LOOP64F scalar_loop_nosimd
#else
#define SCALAR_LOOP64F scalar_loop
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -1433,7 +1436,7 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1449,7 +1452,7 @@ struct op_mul_scale
template<>
struct op_mul_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1574,7 +1577,7 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1596,7 +1599,7 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1610,7 +1613,7 @@ struct op_div_scale<float, float, v_float32>
template<>
struct op_div_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1682,7 +1685,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1698,7 +1701,7 @@ struct op_add_scale
template<>
struct op_add_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_alpha = vx_setall_f64(*scalar);
@ -1715,7 +1718,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1733,7 +1736,7 @@ struct op_add_weighted
template<>
struct op_add_weighted<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
{
const v_float64 v_alpha = vx_setall_f64(scalars[0]);
@ -1832,7 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1854,7 +1857,7 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1868,7 +1871,7 @@ struct op_recip<float, float, v_float32>
template<>
struct op_recip<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);

View File

@ -414,4 +414,4 @@ inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2,
#if !ARITHM_USE_IPP
#define ARITHM_CALL_IPP(...)
#endif
#endif

View File

@ -64,8 +64,6 @@ namespace cv
Discrete Fourier Transform
\****************************************************************************************/
#define CV_MAX_LOCAL_DFT_SIZE (1 << 15)
static unsigned char bitrevTab[] =
{
0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,

View File

@ -69,10 +69,14 @@
/**
Add: _dst[i] = src1[i] + src2[i]_ @n
Sub: _dst[i] = src1[i] - src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_addsub Element-wise add and subtract
//! @{
@ -96,10 +100,14 @@ inline int hal_ni_sub64f(const double *src1_data, size_t src1_step, const double
/**
Minimum: _dst[i] = min(src1[i], src2[i])_ @n
Maximum: _dst[i] = max(src1[i], src2[i])_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_minmax Element-wise minimum or maximum
//! @{
@ -122,11 +130,14 @@ inline int hal_ni_min64f(const double *src1_data, size_t src1_step, const double
/**
Absolute difference: _dst[i] = | src1[i] - src2[i] |_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param scale additional multiplier
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_absdiff Element-wise absolute difference
//! @{
@ -144,10 +155,14 @@ Bitwise AND: _dst[i] = src1[i] & src2[i]_ @n
Bitwise OR: _dst[i] = src1[i] | src2[i]_ @n
Bitwise XOR: _dst[i] = src1[i] ^ src2[i]_ @n
Bitwise NOT: _dst[i] = !src[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_logical Bitwise logical operations
//! @{
@ -201,10 +216,14 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
/**
Compare: _dst[i] = src1[i] op src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param operation one of (CV_HAL_CMP_EQ, CV_HAL_CMP_GT, ...)
*/
//! @addtogroup core_hal_interface_compare Element-wise compare
@ -230,10 +249,14 @@ inline int hal_ni_cmp64f(const double *src1_data, size_t src1_step, const double
/**
Multiply: _dst[i] = scale * src1[i] * src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_multiply Element-wise multiply
@ -249,10 +272,14 @@ inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double
/**
Divide: _dst[i] = scale * src1[i] / src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_divide Element-wise divide
@ -268,9 +295,12 @@ inline int hal_ni_div64f(const double *src1_data, size_t src1_step, const double
/**
Computes reciprocial: _dst[i] = scale / src[i]_
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_reciprocial Element-wise reciprocial
@ -310,10 +340,14 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
/**
Computes weighted sum of two arrays using formula: _dst[i] = a * src1[i] + b * src2[i] + c_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scalars numbers _a_, _b_, and _c_
*/
//! @addtogroup core_hal_interface_addWeighted Element-wise weighted sum
@ -381,7 +415,8 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
/**
@param y,x source Y and X arrays
@param y source Y array
@param x source X array

@param dst destination array
@param len length of arrays
@param angleInDegrees if set to true return angles in degrees, otherwise in radians
@ -399,7 +434,8 @@ inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int
/**
@param x,y source X and Y arrays
@param x source X array
@param y source Y array
@param dst destination array
@param len length of arrays
*/
@ -530,7 +566,8 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
/**
@param context double pointer to context storing all necessary data
@param width,height image dimensions
@param width image width
@param height image height
@param depth image type (CV_32F or CV_64F)
@param src_channels number of channels in input image
@param dst_channels number of channels in output image
@ -540,8 +577,10 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@param context pointer to context storing all necessary data
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
*/
inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@ -557,15 +596,18 @@ inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
/**
@param context double pointer to context storing all necessary data
@param width,height image dimensions
@param width image width
@param height image height
@param depth image type (CV_32F or CV_64F)
@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
*/
inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@param context pointer to context storing all necessary data
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
*/
inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@ -717,11 +759,15 @@ inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* s
/**
@brief Finds the global minimum and maximum in an array.
@param src_data,src_step Source image
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param width Source image width
@param height Source image height
@param depth Depth of source image
@param minVal,maxVal Pointer to the returned global minimum and maximum in an array.
@param minIdx,maxIdx Pointer to the returned minimum and maximum location.
@param minVal Pointer to the returned global minimum.
@param maxVal Pointer to the returned global maximum.
@param minIdx Pointer to the returned minimum location.
@param maxIdx Pointer to the returned maximum location.
@param mask Specified array region.
*/
inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal,
@ -731,6 +777,47 @@ inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, i
#define cv_hal_minMaxIdx hal_ni_minMaxIdx
//! @endcond
/**
@brief Flips a 2D image around vertical, horizontal, or both axes
@param src_type source and destination image type
@param src_data source image data
@param src_step source image step
@param src_width source and destination image width
@param src_height source and destination image height
@param dst_data destination image data
@param dst_step destination image step
@param flip_mode 0 flips around x-axis, positive around y-axis, negative both
*/
inline int hal_ni_flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int flip_mode) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_flip hal_ni_flip
//! @endcond
/**
@brief Rotates a 2D image clockwise by 90, 180 or 270 degrees
@param src_type source and destination image type
@param src_data source image data
@param src_step source image step
@param src_width source image width
If angle has value [180] it is also destination image width
If angle has values [90, 270] it is also destination image height
@param src_height source image height
If angle has value [180] it is also destination image height
If angle has values [90, 270] it is also destination image width
@param dst_data destination image data
@param dst_step destination image step
@param angle clockwise angle for rotation in degrees from set [90, 180, 270]
*/
inline int hal_ni_rotate90(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int angle) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_rotate90 hal_ni_rotate90
//! @endcond
//! @}
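The two hooks above are new HAL extension points; cv::flip invokes the flip hook through CALL_HAL further down in this merge, and the reworked cv::rotate is prepared to do the same for rotate90. A sketch of what a vendor-side override of cv_hal_flip could look like (assumed example, not part of this diff; only the flip_mode == 0 case is handled and everything else is deferred back to OpenCV):

#include <cstring>
#include <opencv2/core/base.hpp>   // CV_HAL_ERROR_* codes, CV_ELEM_SIZE

static int my_hal_flip(int src_type, const unsigned char* src_data, size_t src_step,
                       int src_width, int src_height,
                       unsigned char* dst_data, size_t dst_step, int flip_mode)
{
    if (flip_mode != 0)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;      // let the default implementation run
    const size_t row_bytes = (size_t)src_width * CV_ELEM_SIZE(src_type);
    for (int y = 0; y < src_height; ++y)          // flip around the x-axis: copy rows in reverse order
        std::memcpy(dst_data + (size_t)(src_height - 1 - y) * dst_step,
                    src_data + (size_t)y * src_step, row_bytes);
    return CV_HAL_ERROR_OK;
}
// In a custom HAL replacement header the hook would then be redirected, e.g.:
// #undef  cv_hal_flip
// #define cv_hal_flip my_hal_flip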

View File

@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len )
{
bool res = false;
const uchar* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint8 v_type;
const v_type v_zero = vx_setzero_u8();
constexpr const int unrollCount = 2;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const uchar* srcSimdEnd = src+len0;
@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
res = v_check_any(((v0 | v1) != v_zero));
src += VTraits<v_type>::vlanes();
res = v_check_any((v_ne(v_or(v0, v1), v_zero)));
}
v_cleanup();
@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len )
{
bool res = false;
const ushort* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint16 v_type;
const v_type v_zero = vx_setzero_u16();
constexpr const int unrollCount = 4;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const ushort* srcSimdEnd = src+len0;
@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
res = v_check_any(((v0 | v2) != v_zero));
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
res = v_check_any((v_ne(v_or(v0, v2), v_zero)));
}
v_cleanup();
@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len )
{
bool res = false;
const int* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_int32 v_type;
const v_type v_zero = vx_setzero_s32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const int* srcSimdEnd = src+len0;
@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 |= v2;
v4 |= v6;
res = v_check_any(((v0 | v4) != v_zero));
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
res = v_check_any((v_ne(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len )
{
bool res = false;
const float* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 |= v2;
v4 |= v6;
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len )
{
bool res = false;
const double* srcEnd = src+len;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
typedef v_float64 v_type;
const v_type v_zero = vx_setzero_f64();
constexpr const int unrollCount = 16;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const double* srcSimdEnd = src+len0;
@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v8 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v9 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v10 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v11 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v12 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v13 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v14 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v15 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
v8 |= v9;
v10 |= v11;
v12 |= v13;
v14 |= v15;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v8 = v_or(v8, v9);
v10 = v_or(v10, v11);
v12 = v_or(v12, v13);
v14 = v_or(v14, v15);
v0 |= v2;
v4 |= v6;
v8 |= v10;
v12 |= v14;
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
v8 = v_or(v8, v10);
v12 = v_or(v12, v14);
v0 |= v4;
v8 |= v12;
v0 = v_or(v0, v4);
v8 = v_or(v8, v12);
//res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v8) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v8), v_zero)));
}
v_cleanup();

View File

@ -276,7 +276,7 @@ template<typename T> struct VBLAS
int givens(T*, T*, int, T, T) const { return 0; }
};
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 2*VTraits<v_float32>::vlanes() )

View File

@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12)
double r = .0;
int i = 0;
const int step = VTraits<v_int32>::vlanes();

View File

@ -4,6 +4,7 @@
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/detail/dispatch_helper.impl.hpp"
#include <algorithm> // std::swap_ranges
@ -802,6 +803,9 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
_dst.create( size, type );
Mat dst = _dst.getMat();
CALL_HAL(flip, cv_hal_flip, type, src.ptr(), src.step, src.cols, src.rows,
dst.ptr(), dst.step, flip_mode);
CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));
size_t esz = CV_ELEM_SIZE(type);
@ -1075,10 +1079,8 @@ void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
}
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
static void rotateImpl(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
switch (rotateMode)
{
case ROTATE_90_CLOCKWISE:
@ -1097,4 +1099,51 @@ void rotate(InputArray _src, OutputArray _dst, int rotateMode)
}
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
int angle;
if (_dst.isUMat())
{
rotateImpl(_src, _dst, rotateMode);
return;
}
Mat src = _src.getMat();
int type = src.type();
if( src.empty() )
{
_dst.release();
return;
}
switch (rotateMode)
{
case ROTATE_90_CLOCKWISE:
_dst.create(src.cols, src.rows, type);
angle = 90;
break;
case ROTATE_180:
_dst.create(src.rows, src.cols, type);
angle = 180;
break;
case ROTATE_90_COUNTERCLOCKWISE:
_dst.create(src.cols, src.rows, type);
angle = 270;
break;
default:
_dst.create(src.rows, src.cols, type);
angle = 0;
break;
}
Mat dst = _dst.getMat();
CALL_HAL(rotate90, cv_hal_rotate90, type, src.ptr(), src.step, src.cols, src.rows,
dst.ptr(), dst.step, angle);
// use src (Mat) since _src (InputArray) is updated by _dst.create() when in-place
rotateImpl(src, _dst, rotateMode);
}
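// Usage sketch (editor's addition, not part of this commit): the public behaviour
// of cv::rotate is unchanged. The CALL_HAL(rotate90, ...) hook lets a HAL back end
// service the request before the generic rotateImpl path runs, and the explicit
// _dst.create(...) calls give the HAL a destination with swapped dimensions for
// the 90/270-degree cases.
static void rotate_usage_sketch()   // hypothetical, for illustration only
{
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(0)), dst;
    cv::rotate(src, dst, cv::ROTATE_90_CLOCKWISE);
    CV_Assert(dst.rows == src.cols && dst.cols == src.rows);   // 640 rows x 480 cols
}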
} // namespace

View File

@ -24,7 +24,7 @@ struct SumSqr_SIMD
}
};
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct SumSqr_SIMD<uchar, int, int>

View File

@ -1546,9 +1546,9 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
if (!src.empty() && mask.empty())
{
if( minidx == 0 )
minidx = 1;
if( maxidx == 0 )
maxidx = 1;
minidx = 1;
if( maxidx == 0 )
maxidx = 1;
}
if( minidx == 0 )

View File

@ -791,7 +791,7 @@ int getThreadNum()
return 0;
#endif
#elif defined HAVE_HPX
return (int)(hpx::get_num_worker_threads());
return (int)(hpx::get_num_worker_threads());
#elif defined HAVE_OPENMP
return omp_get_thread_num();
#elif defined HAVE_GCD

View File

@ -367,4 +367,4 @@ size_t base64::RawDataToBinaryConvertor::make_to_binary_funcs(const std::string
return offset_packed;
}
}
}

View File

@ -124,4 +124,4 @@ private:
}
}
#endif
#endif

View File

@ -306,9 +306,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
| The values to return on conversions to 32-bit integer formats that raise an
| invalid exception.
*----------------------------------------------------------------------------*/
#define ui32_fromPosOverflow 0xFFFFFFFF
#define ui32_fromNegOverflow 0
#define ui32_fromNaN 0xFFFFFFFF
#define i32_fromPosOverflow 0x7FFFFFFF
#define i32_fromNegOverflow (-0x7FFFFFFF - 1)
#define i32_fromNaN 0x7FFFFFFF
@ -317,9 +314,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
| The values to return on conversions to 64-bit integer formats that raise an
| invalid exception.
*----------------------------------------------------------------------------*/
#define ui64_fromPosOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF )
#define ui64_fromNegOverflow 0
#define ui64_fromNaN UINT64_C( 0xFFFFFFFFFFFFFFFF )
#define i64_fromPosOverflow UINT64_C( 0x7FFFFFFFFFFFFFFF )
//fixed unsigned unary minus: -x == ~x + 1
//#define i64_fromNegOverflow (-UINT64_C( 0x7FFFFFFFFFFFFFFF ) - 1)
@ -422,34 +416,6 @@ struct uint64_extra { uint64_t v, extra; };
struct uint128_extra { struct uint128 v; uint64_t extra; };
#endif
/*----------------------------------------------------------------------------
| These macros are used to isolate the differences in word order between big-
| endian and little-endian platforms.
*----------------------------------------------------------------------------*/
#ifndef WORDS_BIGENDIAN
#define wordIncr 1
#define indexWord( total, n ) (n)
#define indexWordHi( total ) ((total) - 1)
#define indexWordLo( total ) 0
#define indexMultiword( total, m, n ) (n)
#define indexMultiwordHi( total, n ) ((total) - (n))
#define indexMultiwordLo( total, n ) 0
#define indexMultiwordHiBut( total, n ) (n)
#define indexMultiwordLoBut( total, n ) 0
#define INIT_UINTM4( v3, v2, v1, v0 ) { v0, v1, v2, v3 }
#else
#define wordIncr -1
#define indexWord( total, n ) ((total) - 1 - (n))
#define indexWordHi( total ) 0
#define indexWordLo( total ) ((total) - 1)
#define indexMultiword( total, m, n ) ((total) - 1 - (m))
#define indexMultiwordHi( total, n ) 0
#define indexMultiwordLo( total, n ) ((total) - (n))
#define indexMultiwordHiBut( total, n ) 0
#define indexMultiwordLoBut( total, n ) (n)
#define INIT_UINTM4( v3, v2, v1, v0 ) { v3, v2, v1, v0 }
#endif
enum {
softfloat_mulAdd_subC = 1,
softfloat_mulAdd_subProd = 2

View File

@ -220,4 +220,4 @@ void split64s(const int64* src, int64** dst, int len, int cn )
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace
}} // namespace

View File

@ -672,7 +672,7 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds
}
} // namespace
CVTEST_GUARD_SYMBOL(inRange);
CVTEST_GUARD_SYMBOL(inRange)
struct InRangeSOp : public BaseArithmOp
{
@ -1202,7 +1202,7 @@ struct MeanOp : public BaseArithmOp
MeanOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = 3;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
{
dst.create(1, 1, CV_64FC4);
@ -1225,7 +1225,7 @@ struct SumOp : public BaseArithmOp
SumOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = 3;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat&)
{
dst.create(1, 1, CV_64FC4);
@ -1285,7 +1285,7 @@ struct MeanStdDevOp : public BaseArithmOp
{
cn = 0;
context = 7;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
{
dst.create(1, 2, CV_64FC4);
@ -1326,7 +1326,7 @@ struct NormOp : public BaseArithmOp
{
context = 1;
normType = 0;
};
}
int getRandomType(RNG& rng)
{
int type = cvtest::randomType(rng, baseArithmTypeMask, 1, 4);
@ -1372,7 +1372,7 @@ struct MinMaxLocOp : public BaseArithmOp
MinMaxLocOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = ARITHM_MAX_NDIMS*2 + 2;
};
}
int getRandomType(RNG& rng)
{
return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
@ -1419,7 +1419,7 @@ struct reduceArgMinMaxOp : public BaseArithmOp
isLast(false), isMax(false), axis(0)
{
context = ARITHM_MAX_NDIMS*2 + 2;
};
}
int getRandomType(RNG& rng) override
{
return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);

View File

@ -435,6 +435,8 @@ protected:
CV_Assert( ov1 == v1 );
CV_Assert( osc1 == sc1 );
CV_Assert( og1 == g1 );
fs.release();
remove(fname.c_str());
}
catch(...)
{
@ -489,6 +491,7 @@ TEST(Core_InputOutput, FileStorage)
char arr[66];
snprintf(arr, sizeof(arr), "snprintf is hell %d", 666);
EXPECT_NO_THROW(f << arr);
remove(file.c_str());
}
TEST(Core_InputOutput, FileStorageKey)
@ -534,6 +537,7 @@ TEST(Core_InputOutput, FileStorageSpaces)
ASSERT_STREQ(values[i].c_str(), valuesReadAppend[i].c_str());
}
g3.release();
EXPECT_EQ(0, remove(fileName.c_str()));
}
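// Pattern sketch (editor's addition, not part of this commit): the io tests are
// being moved from fixed file names in the working directory to cv::tempfile(),
// and each test now removes its file on success.
//
//     std::string name = cv::tempfile("example.yml");   // unique path in temp dir
//     {
//         cv::FileStorage fs(name, cv::FileStorage::WRITE);
//         fs << "value" << 42;
//     }
//     EXPECT_EQ(0, remove(name.c_str()));               // clean up after the test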
struct data_t
@ -585,12 +589,15 @@ struct data_t
static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false)
{
const bool generateTestData = false; // enable to regenerate reference in opencv_extra
const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
CV_Assert(test_info);
std::string name = (std::string(test_info->test_case_name()) + "--" + test_info->name() + suffix_name);
std::string name_34 = string(cvtest::TS::ptr()->get_data_path()) + "io/3_4/" + name;
if (!testReadWrite)
if (!testReadWrite || generateTestData)
name = string(cvtest::TS::ptr()->get_data_path()) + "io/" + name;
else
name = cv::tempfile(name.c_str());
{
const size_t rawdata_N = 40;
@ -636,10 +643,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false)
rawdata.push_back(tmp);
}
}
#ifdef GENERATE_TEST_DATA
#else
if (testReadWrite || useMemory)
#endif
if (testReadWrite || useMemory || generateTestData)
{
cv::FileStorage fs(name, write_flags + (useMemory ? cv::FileStorage::MEMORY : 0));
fs << "normal_2d_mat" << _2d_out;
@ -761,9 +765,13 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
ASSERT_EQ(_rd_in.dims , _rd_out.dims);
ASSERT_EQ(_rd_in.depth(), _rd_out.depth());
if (useMemory) {
if (useMemory)
{
EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF));
}
if (testReadWrite && !useMemory && !generateTestData) {
EXPECT_EQ(0, remove(name.c_str()));
}
}
}
@ -810,7 +818,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
CV_Assert(test_info);
std::string name = std::string(test_info->test_case_name()) + "--" + test_info->name();
std::string name = cv::tempfile();
const char data[] = {0x00, 0x2f, 0x4a, 0x4a, 0x50, 0x4a, 0x4a };
std::ofstream file;
@ -822,6 +830,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
// This just shouldn't segfault, otherwise it's fine
EXPECT_ANY_THROW(FileStorage(name, FileStorage::READ));
EXPECT_EQ(0, remove(name.c_str()));
}
TEST(Core_InputOutput, filestorage_base64_valid_call)
@ -832,18 +841,6 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
: (std::string(test_info->test_case_name()) + "--" + test_info->name());
char const * filenames[] = {
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
"core_io_base64_other_test.yml?base64",
"core_io_base64_other_test.xml?base64",
"core_io_base64_other_test.json?base64",
0
};
char const * real_name[] = {
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
@ -855,14 +852,16 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
for (int n = 0; n < 6; n++)
{
char const* suffix_name = filenames[n];
SCOPED_TRACE(suffix_name);
std::string name = basename + '_' + suffix_name;
std::string file_name = basename + '_' + real_name[n];
const int idx = n / 2;
const std::string mode_suffix = (n % 2 == 0) ? "" : "?base64";
std::string suffix_name = basename + "_" + filenames[idx];
std::string file_name = cv::tempfile(suffix_name.c_str());
std::string mode_file_name = file_name + mode_suffix;
SCOPED_TRACE(mode_file_name);
EXPECT_NO_THROW(
{
cv::FileStorage fs(name, cv::FileStorage::WRITE_BASE64);
cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE_BASE64);
fs << "manydata" << "[";
fs << "[:";
@ -890,7 +889,7 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
EXPECT_NO_THROW(
{
cv::FileStorage fs(name, cv::FileStorage::WRITE);
cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE);
fs << "manydata" << "[";
fs << str_out;
@ -934,10 +933,10 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
0
};
for (char const ** ptr = filenames; *ptr; ptr++)
for (int idx = 0; idx < 3; ++idx)
{
char const * suffix_name = *ptr;
std::string name = basename + '_' + suffix_name;
const string base_suffix = basename + '_' + filenames[idx];
std::string name = cv::tempfile(base_suffix.c_str());
EXPECT_NO_THROW({
cv::FileStorage fs(name, cv::FileStorage::WRITE);
@ -958,7 +957,7 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
TEST(Core_InputOutput, filestorage_yml_vec2i)
{
const std::string file_name = "vec2i.yml";
const std::string file_name = cv::tempfile("vec2i.yml");
cv::Vec2i vec(2, 1), ovec;
/* write */
@ -1040,7 +1039,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
}
}
String fileName = "vec_vec_io_test.";
String basename = "vec_vec_io_test.";
std::vector<String> formats;
formats.push_back("xml");
@ -1049,11 +1048,13 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
for(size_t i = 0; i < formats.size(); i++)
{
FileStorage writer(fileName + formats[i], FileStorage::WRITE);
const String basename_plus(basename + formats[i]);
const String fileName = tempfile(basename_plus.c_str());
FileStorage writer(fileName, FileStorage::WRITE);
writer << "vecVecMat" << outputMats;
writer.release();
FileStorage reader(fileName + formats[i], FileStorage::READ);
FileStorage reader(fileName, FileStorage::READ);
std::vector<std::vector<Mat> > testMats;
reader["vecVecMat"] >> testMats;
@ -1070,7 +1071,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
}
reader.release();
remove((fileName + formats[i]).c_str());
remove(fileName.c_str());
}
}
@ -1661,7 +1662,7 @@ TEST(Core_InputOutput, FileStorage_json_bool)
TEST(Core_InputOutput, FileStorage_free_file_after_exception)
{
const std::string fileName = "FileStorage_free_file_after_exception_test.yml";
const std::string fileName = cv::tempfile("FileStorage_free_file_after_exception_test.yml");
const std::string content = "%YAML:1.0\n cameraMatrix;:: !<tag:yaml.org,2002:opencv-matrix>\n";
std::fstream testFile;
@ -1684,11 +1685,11 @@ TEST(Core_InputOutput, FileStorage_free_file_after_exception)
TEST(Core_InputOutput, FileStorage_write_to_sequence)
{
const std::vector<std::string> formatExts = { ".yml", ".json", ".xml" };
const std::string fileName = "FileStorage_write_to_sequence";
for (const auto& ext : formatExts)
{
FileStorage fs(fileName + ext, FileStorage::WRITE);
const std::string name = tempfile(ext.c_str());
FileStorage fs(name, FileStorage::WRITE);
std::vector<int> in = { 23, 42 };
fs.startWriteStruct("some_sequence", cv::FileNode::SEQ);
for (int i : in)
@ -1696,7 +1697,7 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
fs.endWriteStruct();
fs.release();
FileStorage fsIn(fileName + ext, FileStorage::READ);
FileStorage fsIn(name, FileStorage::READ);
FileNode seq = fsIn["some_sequence"];
FileNodeIterator it = seq.begin(), it_end = seq.end();
std::vector<int> out;
@ -1704,12 +1705,13 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
out.push_back((int)*it);
EXPECT_EQ(in, out);
EXPECT_EQ(0, remove(name.c_str()));
}
}
TEST(Core_InputOutput, FileStorage_YAML_parse_multiple_documents)
{
const std::string filename = "FileStorage_YAML_parse_multiple_documents.yml";
const std::string filename = cv::tempfile("FileStorage_YAML_parse_multiple_documents.yml");
FileStorage fs;
fs.open(filename, FileStorage::WRITE);

View File

@ -475,12 +475,13 @@ TEST(Core_PCA, accuracy)
ASSERT_LE(err, diffBackPrjEps) << "bad accuracy of cvBackProjectPCA() (CV_PCA_DATA_AS_COL)";
#endif
// Test read and write
FileStorage fs( "PCA_store.yml", FileStorage::WRITE );
const std::string filename = cv::tempfile("PCA_store.yml");
FileStorage fs( filename, FileStorage::WRITE );
rPCA.write( fs );
fs.release();
PCA lPCA;
fs.open( "PCA_store.yml", FileStorage::READ );
fs.open( filename, FileStorage::READ );
lPCA.read( fs.root() );
err = cvtest::norm(rPCA.eigenvectors, lPCA.eigenvectors, NORM_L2 | NORM_RELATIVE);
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
@ -488,6 +489,7 @@ TEST(Core_PCA, accuracy)
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
err = cvtest::norm(rPCA.mean, lPCA.mean, NORM_L2 | NORM_RELATIVE);
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
EXPECT_EQ(0, remove(filename.c_str()));
}
class Core_ArrayOpTest : public cvtest::BaseTest

View File

@ -588,11 +588,11 @@ CV__DNN_INLINE_NS_BEGIN
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
};
class CV_EXPORTS ReLULayer : public ActivationLayer

View File

@ -28,22 +28,28 @@ public:
target = (dnn::Target)(int)get<1>(GetParam());
}
void processNet(std::string weights, std::string proto, const Mat& input, const std::string& outputLayer = "")
{
randu(input, 0.0f, 1.0f);
void processNet(std::string weights, std::string proto,
const std::vector<std::tuple<Mat, std::string>>& inputs, const std::string& outputLayer = ""){
weights = findDataFile(weights, false);
if (!proto.empty())
proto = findDataFile(proto);
net = readNet(proto, weights);
net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
// Set multiple inputs
for(auto &inp: inputs){
net.setInput(std::get<0>(inp), std::get<1>(inp));
}
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
MatShape netInputShape = shape(1, 3, input.rows, input.cols);
// Calculate memory consumption for all inputs
std::vector<MatShape> netMatShapes;
for(auto &inp: inputs){
netMatShapes.push_back(shape(std::get<0>(inp)));
}
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
net.getMemoryConsumption(netMatShapes, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netMatShapes);
CV_Assert(flops > 0);
net.forward(outputLayer); // warmup
@ -59,33 +65,48 @@ public:
SANITY_CHECK_NOTHING();
}
void processNet(std::string weights, std::string proto,
Mat &input, const std::string& outputLayer = "")
{
processNet(weights, proto, {std::make_tuple(input, "")}, outputLayer);
}
void processNet(std::string weights, std::string proto,
Size inpSize, const std::string& outputLayer = "")
{
Mat input_data(inpSize, CV_32FC3);
randu(input_data, 0.0f, 1.0f);
Mat input = blobFromImage(input_data, 1.0, Size(), Scalar(), false);
processNet(weights, proto, input, outputLayer);
}
};
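// Call sketch (editor's addition, not part of this commit): the new overload takes
// a list of (blob, input name) pairs so multi-input models can be benchmarked; the
// model path and input names below are hypothetical.
//
//     Mat a = blobFromImage(imgA), b = blobFromImage(imgB);
//     processNet("", "dnn/some_two_input_model.onnx",
//                {std::make_tuple(a, "input_a"), std::make_tuple(b, "input_b")});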
PERF_TEST_P_(DNNTestNetwork, AlexNet)
{
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", Mat(cv::Size(227, 227), CV_32FC3));
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", cv::Size(227, 227));
}
PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
{
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, ResNet_50)
{
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
{
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", Mat(cv::Size(227, 227), CV_32FC3));
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", cv::Size(227, 227));
}
PERF_TEST_P_(DNNTestNetwork, Inception_5h)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("");
processNet("dnn/tensorflow_inception_graph.pb", "", Mat(cv::Size(224, 224), CV_32FC3), "softmax2");
processNet("dnn/tensorflow_inception_graph.pb", "", cv::Size(224, 224), "softmax2");
}
PERF_TEST_P_(DNNTestNetwork, ENet)
@ -97,12 +118,12 @@ PERF_TEST_P_(DNNTestNetwork, ENet)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
throw SkipTestException("");
#endif
processNet("dnn/Enet-model-best.net", "", Mat(cv::Size(512, 256), CV_32FC3));
processNet("dnn/Enet-model-best.net", "", cv::Size(512, 256));
}
PERF_TEST_P_(DNNTestNetwork, SSD)
{
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, OpenFace)
@ -111,27 +132,27 @@ PERF_TEST_P_(DNNTestNetwork, OpenFace)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL))
throw SkipTestException("");
#endif
processNet("dnn/openface_nn4.small2.v1.t7", "", Mat(cv::Size(96, 96), CV_32FC3));
processNet("dnn/openface_nn4.small2.v1.t7", "", cv::Size(96, 96));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
{
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
@ -140,17 +161,17 @@ PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
throw SkipTestException("");
// The same .caffemodel but modified .prototxt
// See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", Mat(cv::Size(368, 368), CV_32FC3));
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", cv::Size(368, 368));
}
PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
{
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, YOLOv3)
@ -168,9 +189,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", inp);
}
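// Equivalence sketch (editor's addition, not part of this commit): blobFromImage
// with swapRB=true and scalefactor=1/255 folds the previous cvtColor + convertTo
// pair into one call, producing an NCHW blob directly:
//
//     Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), /*swapRB=*/true);
//     // was: cvtColor(sample, sample, COLOR_BGR2RGB);
//     //      sample.convertTo(inp, CV_32FC3, 1.0f / 255);  // then blobbed inside processNet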
@ -186,9 +205,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", inp);
}
@ -199,20 +216,39 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov4-tiny-2020-12.weights", "dnn/yolov4-tiny-2020-12.cfg", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv5) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolov5n.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv8) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolov8n.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOX) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolox_s.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
processNet("dnn/frozen_east_text_detection.pb", "", Mat(cv::Size(320, 320), CV_32FC3));
processNet("dnn/frozen_east_text_detection.pb", "", cv::Size(320, 320));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", Mat(cv::Size(320, 240), CV_32FC3));
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", cv::Size(320, 240));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
@ -233,7 +269,8 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", Mat(cv::Size(800, 600), CV_32FC3));
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt",
cv::Size(800, 600));
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet)
@ -241,12 +278,88 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet)
if (target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/dog416.png"));
resize(sample, sample, Size(512, 512));
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0/255);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(512, 512), Scalar(), true);
processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", inp);
}
PERF_TEST_P_(DNNTestNetwork, EfficientNet)
{
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/efficientnet-lite4.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YuNet) {
processNet("", "dnn/onnx/models/yunet-202303.onnx", cv::Size(640, 640));
}
PERF_TEST_P_(DNNTestNetwork, SFace) {
processNet("", "dnn/face_recognition_sface_2021dec.onnx", cv::Size(112, 112));
}
PERF_TEST_P_(DNNTestNetwork, MPPalm) {
Mat inp(cv::Size(192, 192), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/palm_detection_mediapipe_2023feb.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, MPHand) {
Mat inp(cv::Size(224, 224), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/handpose_estimation_mediapipe_2023feb.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, MPPose) {
Mat inp(cv::Size(256, 256), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/pose_estimation_mediapipe_2023mar.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, PPOCRv3) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
processNet("", "dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", cv::Size(736, 736));
}
PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) {
processNet("", "dnn/human_segmentation_pphumanseg_2023mar.onnx", cv::Size(192, 192));
}
PERF_TEST_P_(DNNTestNetwork, CRNN) {
Mat inp(cv::Size(100, 32), CV_32FC1);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, ViTTrack) {
Mat inp1(cv::Size(128, 128), CV_32FC3);
Mat inp2(cv::Size(256, 256), CV_32FC3);
randu(inp1, 0.0f, 1.0f);
randu(inp2, 0.0f, 1.0f);
inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
processNet("", "dnn/onnx/models/vitTracker.onnx", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
{
if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
throw SkipTestException("");
}
Mat inp = imread(findDataFile("dnn/dog416.png"));
inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true);
processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", inp);
}
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
} // namespace

View File

@ -17,7 +17,7 @@ namespace cv { namespace dnn {
class ImportNodeWrapper
{
public:
virtual ~ImportNodeWrapper() {};
virtual ~ImportNodeWrapper() {}
virtual int getNumInputs() const = 0;
@ -33,7 +33,7 @@ public:
class ImportGraphWrapper
{
public:
virtual ~ImportGraphWrapper() {};
virtual ~ImportGraphWrapper() {}
virtual Ptr<ImportNodeWrapper> getNode(int idx) const = 0;

View File

@ -590,7 +590,7 @@ void InfEngineNgraphNet::init(Target targetId)
allBlobs[name] = ov::Tensor(src.get_element_type(), outShape, src.data());
}
ppp.output(i++).tensor().set_element_type(ov::element::f32); // Should be always FP32
ppp.output(i++).tensor().set_element_type(src.get_element_type());
}
ppp.build();
@ -840,6 +840,8 @@ ov::Tensor wrapToNgraphBlob(const Mat& m) {
return ov::Tensor(ov::element::f32, shape, m.data);
else if (m.type() == CV_8U)
return ov::Tensor(ov::element::u8, shape, m.data);
else if (m.type() == CV_8SC1)
return ov::Tensor(ov::element::i8, shape, m.data);
else if (m.type() == CV_32SC1)
return ov::Tensor(ov::element::i32, shape, m.data);
else
@ -1234,6 +1236,32 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
#endif // OpenVINO >= 2022.1
}
ngraph::Output<ngraph::Node> ngraphQuantize(ngraph::Output<ngraph::Node> input, float output_sc, float output_zp) {
float outLow = -128, outHigh = 127;
float inpLow = output_sc * (outLow - output_zp);
float inpHigh = output_sc * (outHigh - output_zp);
return std::make_shared<ngraph::op::FakeQuantize>(input,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpHigh),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outHigh),
256 // levels
);
}
ngraph::Output<ngraph::Node> ngraphDequantize(ngraph::Output<ngraph::Node> input, float input_sc, float input_zp) {
float inpLow = -128, inpHigh = 127;
float outLow = input_sc * (inpLow - input_zp);
float outHigh = input_sc * (inpHigh - input_zp);
return std::make_shared<ngraph::op::FakeQuantize>(input,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpHigh),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outHigh),
256 // levels
);
}
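// Scalar sketch (editor's addition, not part of this commit) of the affine mapping
// these FakeQuantize ranges encode. Quantization uses
//     q = saturate<int8>( round(x / sc) + zp )       with q in [-128, 127]
// and dequantization inverts it; the (inpLow, inpHigh) -> (outLow, outHigh) limits
// with 256 levels express the same per-element mapping.
static inline float dequantize_int8_sketch(int8_t q, float sc, float zp)
{
    return sc * ((float)q - zp);
}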
#endif
}}

View File

@ -148,6 +148,9 @@ private:
InferenceEngine::CNNNetwork t_net;
};
ngraph::Output<ngraph::Node> ngraphQuantize(ngraph::Output<ngraph::Node> input, float output_sc, float output_zp);
ngraph::Output<ngraph::Node> ngraphDequantize(ngraph::Output<ngraph::Node> input, float input_sc, float input_zp);
#endif // HAVE_DNN_NGRAPH
}} // namespace cv::dnn

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@ -110,7 +111,8 @@ public:
return true;
}
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -238,6 +240,27 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
std::vector<size_t> shape(input.get_shape().size(), 1);
shape[1] = origin_weights.total();
ngraph::Output<ngraph::Node> res;
auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, origin_weights.data);
auto ieBias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, origin_bias.data);
res = std::make_shared<ngraph::op::v1::Multiply>(input, ieWeights);
res = std::make_shared<ngraph::op::v1::Add>(res, ieBias);
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -10,6 +10,7 @@
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <iostream>
#include <numeric>
@ -18,7 +19,7 @@ namespace cv
namespace dnn
{
#if CV_SIMD
#if CV_SIMD128
static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
{
@ -195,7 +196,8 @@ public:
}
#endif
// Only default backend and Conv1D/Conv2D/Conv3D are supported
return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3;
return (backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -561,6 +563,126 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(!blobs.empty());
CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
CV_CheckTypeEQ(weightsMat.type(), CV_8S, "");
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode.get_shape();
CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
CV_Assert(ieInpNode.get_element_type() == ngraph::element::f32);
ngraph::Output<ngraph::Node> ieWeights;
if (nodes.size() > 1)
ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
const int inpCn = dims[1];
const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
const int group = inpCn / inpGroupCn;
std::vector<size_t> kernel_shape;
if (group != 1)
{
kernel_shape.push_back(group);
}
kernel_shape.push_back(numOutput / group);
kernel_shape.push_back(inpCn / group);
std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));
if (nodes.size() == 1)
{
ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, kernel_shape, blobs[0].data);
}
else
{
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
ieWeights = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
}
ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
if (!padMode.empty())
pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
ieInpNode = ngraphDequantize(ieInpNode, input_sc, input_zp);
const float low = -128, high = 127;
std::vector<float> inpLows(numOutput, low);
std::vector<float> inpHighs(numOutput, high);
std::vector<float> outLows(numOutput);
std::vector<float> outHighs(numOutput);
std::vector<size_t> quantShape(kernel_shape.size(), 1);
if (group != 1)
{
quantShape[0] = group;
quantShape[1] = numOutput / group;
}
else
{
quantShape[0] = numOutput;
}
for (int i = 0; i < numOutput; ++i) {
outLows[i] = low * outputMultiplier[i] * output_sc / input_sc;
outHighs[i] = high * outputMultiplier[i] * output_sc / input_sc;
}
ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpHighs.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outHighs.data()),
256 // levels
);
ngraph::Output<ngraph::Node> conv_node;
if (group != 1) {
conv_node = std::make_shared<ngraph::op::v1::GroupConvolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
} else {
conv_node = std::make_shared<ngraph::op::v1::Convolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
}
std::vector<size_t> shape(conv_node.get_shape().size(), 1);
shape[1] = conv_node.get_shape()[1];
if (biasvec.size() || nodes.size() == 3)
{
std::shared_ptr<ngraph::Node> bias;
if (nodes.size() == 3)
{
auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
}
else
{
std::vector<float> ovBias(numOutput);
for (int i = 0; i < numOutput; ++i) {
ovBias[i] = (biasvec[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier[i] * output_sc;
}
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), ovBias.data());
}
conv_node = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
}
conv_node = ngraphQuantize(conv_node, output_sc, output_zp);
return new InfEngineNgraphNode(conv_node);
}
#endif // HAVE_DNN_NGRAPH
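// Editor's note (sketch, not part of this commit): assuming the usual per-channel
// multiplier convention outputMultiplier[i] = input_sc * weight_sc[i] / output_sc,
// the effective float scale of channel i's int8 weights is
//     weight_sc[i] = outputMultiplier[i] * output_sc / input_sc
// which is exactly the factor applied to the [-128, 127] limits in outLows/outHighs.
// The ovBias computation likewise appears to re-add the input_zp * sum(weight row)
// term that the int8 kernels fold into their accumulators, rescaled to float.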
class ParallelConv : public cv::ParallelLoopBody
{
public:
@ -893,7 +1015,7 @@ public:
outptr[0] = std::min(std::max(out1, -128), 127);
out_j = 1;
}
#if CV_SIMD
#if CV_SIMD128
if( stride_w == 1 )
{
const int out_delta = 16;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>
@ -56,7 +57,7 @@ public:
return tvActType != tvActNotSupported;
}
#endif
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -244,6 +245,42 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::Output<ngraph::Node> res;
if (type == "ReLU6Int8") {
res = std::make_shared<ngraph::op::Clamp>(input, 0.0f, 6.0f);
} else if (type == "ReLUInt8") {
if (slope) {
auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
res = std::make_shared<ngraph::op::PRelu>(input, param);
} else {
res = std::make_shared<ngraph::op::Relu>(input);
}
} else if (type == "ELUInt8") {
res = std::make_shared<ngraph::op::Elu>(input, 1.0f);
} else if (type == "MishInt8") {
res = std::make_shared<ngraph::op::v4::Mish>(input);
} else if (type == "AbsValInt8") {
res = std::make_shared<ngraph::op::Abs>(input);
} else if (type == "SigmoidInt8") {
res = std::make_shared<ngraph::op::Sigmoid>(input);
} else {
CV_Error(Error::StsNotImplemented, type + " activation with OpenVINO");
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
@ -138,7 +139,7 @@ public:
// For TimVX Backend, only ELTWISE_CHANNNELS_SAME was supported.
if (backendId == DNN_BACKEND_TIMVX && haveTimVX())
return channelsModeInput == ELTWISE_CHANNNELS_SAME;
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -369,6 +370,38 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(nodes.size() >= 2);
std::vector<ngraph::Output<ngraph::Node>> ieInpNodes(nodes.size());
for (size_t i = 0; i < nodes.size(); i++)
{
ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
float input_sc = !coeffs.empty() ? coeffs[i] : 1.0f;
float input_zp = op == PROD ? zeropoints[i] : 0.0f;
ieInpNodes[i] = ngraphDequantize(ieInpNodes[i], input_sc, input_zp);
}
auto res = ieInpNodes[0];
for (size_t i = 1; i < ieInpNodes.size(); i++)
{
switch (op) {
case SUM: res = std::make_shared<ngraph::op::v1::Add>(res, ieInpNodes[i]); break;
case PROD: res = std::make_shared<ngraph::op::v1::Multiply>(res, ieInpNodes[i]); break;
case MAX: res = std::make_shared<ngraph::op::v1::Maximum>(res, ieInpNodes[i]); break;
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
}
}
res = ngraphQuantize(res, 1.0f, offset);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
class EltwiseInvoker : public ParallelLoopBody
{
EltwiseLayerInt8Impl& self;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@ -86,7 +87,8 @@ public:
return false;
}
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -303,7 +305,7 @@ public:
#endif
{
int i = 0;
#if CV_SIMD
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
@ -395,6 +397,77 @@ public:
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_CheckTypeEQ(blobs[0].type(), CV_8S, ""); // weights
CV_CheckTypeEQ(blobs[1].type(), CV_32S, ""); // bias
CV_CheckTypeEQ(outputMultiplier.type(), CV_32F, "");
ngraph::Output<ngraph::Node> input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
ngraph::Output<ngraph::Node> ieWeights, ieBias, matmul;
bool transA = false, transB = true;
size_t numOutput = blobs[0].size[0];
if (nodes.size() == 2)
{
CV_Error(Error::StsNotImplemented, "");
// auto inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
// matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, transA, transB);
}
else
{
std::vector<int> shape(1 + normalize_axis(axis, input.get_shape().size()), 0);
shape[shape.size() - 1] = -1;
input = std::make_shared<ngraph::op::v1::Reshape>(
input,
std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{shape.size()}, shape.data()),
true
);
input = ngraphDequantize(input, input_sc, input_zp);
const float low = -128, high = 127;
std::vector<float> inpLows(numOutput, low);
std::vector<float> inpHighs(numOutput, high);
std::vector<float> outLows(numOutput);
std::vector<float> outHighs(numOutput);
for (int i = 0; i < numOutput; ++i) {
outLows[i] = low * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
outHighs[i] = high * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
}
std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, weight_shape, blobs[0].data);
ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpHighs.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outHighs.data()),
256 // levels
);
matmul = std::make_shared<ngraph::op::MatMul>(input, ieWeights, transA, transB);
}
if (blobs.size() > 1) {
int32_t* bias = blobs[1].ptr<int32_t>();
std::vector<float> ovBias(blobs[1].total());
for (int i = 0; i < ovBias.size(); ++i) {
ovBias[i] = (bias[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier.ptr<float>()[i] * output_sc;
}
auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape{blobs[1].total()}, ovBias.data());
matmul = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node);
}
matmul = ngraphQuantize(matmul, output_sc, output_zp);
return new InfEngineNgraphNode(matmul);
}
#endif // HAVE_DNN_NGRAPH
Mat weightsMat, biasMat, outputMultiplier, activationLUT;
Ptr<ActivationLayerInt8> activ;
};

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <float.h>
@ -124,6 +125,10 @@ public:
return type == MAX || type == AVE;
return false;
}
else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
return true;
}
return false;
}
@ -271,6 +276,49 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
if (!padMode.empty())
pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
auto rounding_type = ceilMode ? ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR;
ngraph::Output<ngraph::Node> pool;
if (type == MAX) {
pool = std::make_shared<ngraph::op::v1::MaxPool>(input, ngraph::Strides(strides),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
rounding_type, pad_type);
} else if (type == AVE) {
pool = std::make_shared<ngraph::op::v1::AvgPool>(input, ngraph::Strides(strides),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
!avePoolPaddedArea, rounding_type, pad_type);
} else if (type == SUM) {
ngraph::Shape inpShape = input.get_shape();
CV_Assert(inpShape.size() == 2 + kernel_size.size());
std::vector<int64_t> axes;
for (size_t i = 0; i < kernel_size.size(); i++)
{
if (inpShape[2 + i] == kernel_size[i])
axes.push_back(2 + i);
}
auto reduction_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes);
pool = std::make_shared<ngraph::op::v1::ReduceSum>(input, reduction_axes, true);
} else {
CV_Error(Error::StsNotImplemented, format("INT8 Pooling type: %d", type));
}
pool = ngraphQuantize(pool, output_sc, output_zp);
return new InfEngineNgraphNode(pool);
}
#endif // HAVE_DNN_NGRAPH
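// Editor's note (sketch, not part of this commit): the SUM branch reduces only the
// spatial axes whose kernel dimension spans the full extent, so for the common
// global case an NCHW input of shape {1, C, H, W} pooled with kernel_size == {H, W}
// is lowered to ReduceSum over axes {2, 3} with keep_dims=true, giving {1, C, 1, 1}.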
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
namespace cv
{
@ -98,7 +99,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -171,6 +173,16 @@ public:
else
inputs[0].convertTo(outputs[0], CV_8S, 1.f/scales[0], zeropoints[0]);
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto quantized = ngraphQuantize(input, scales[0], zeropoints[0]);
return Ptr<BackendNode>(new InfEngineNgraphNode(quantized));
}
#endif // HAVE_DNN_NGRAPH
};
// Dequantize INT8 Inputs to FP32/FP16
@ -214,7 +226,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -285,6 +297,16 @@ public:
else
inputs[0].convertTo(outputs[0], CV_32F, scales[0], -(scales[0]*zeropoints[0]));
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto quantized = ngraphDequantize(input, scales[0], zeropoints[0]);
return new InfEngineNgraphNode(quantized);
}
#endif // HAVE_DNN_NGRAPH
};
// Rescale/Requantize INT8 Inputs from (scale1, zeropoint1) to (scale2, zeropoint2)

View File

@ -6,6 +6,7 @@
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "../ie_ngraph.hpp"
namespace cv
{
@ -72,7 +73,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -186,6 +188,59 @@ public:
return flops;
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
std::vector<ngraph::Output<ngraph::Node>> ieInpNodes(nodes.size());
for (int i = 0; i < nodes.size(); ++i) {
ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
}
ieInpNodes[0] = ngraphDequantize(ieInpNodes[0], inp_sc[0], inp_zp[0]);
CV_Assert(!blobs.empty() || ieInpNodes.size() == 1 + (int)hasWeights + (int)hasBias);
ngraph::Output<ngraph::Node> weights, bias;
if (blobs.empty()) {
if (hasWeights)
weights = ieInpNodes[1];
if (hasBias)
bias = ieInpNodes[1 + (int)hasWeights];
} else {
std::vector<size_t> shape = ieInpNodes[0].get_shape();
int cAxis = normalize_axis(axis, shape.size());
size_t numWeights = blobs[0].total();
for (int i = 0; i < cAxis; ++i) {
shape[i] = 1;
}
for (int i = cAxis; i < shape.size(); ++i) {
if (numWeights == 1) {
shape[i] = 1;
}
numWeights = std::max(numWeights / shape[i], (size_t)1);
}
if (hasWeights)
weights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, blobs[0].data);
if (hasBias)
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, blobs[(int)hasWeights].data);
}
ngraph::Output<ngraph::Node> res = ieInpNodes[0];
if (hasWeights) {
res = std::make_shared<ngraph::op::v1::Multiply>(res, weights);
}
if (hasBias) {
res = std::make_shared<ngraph::op::v1::Add>(res, bias);
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
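// Worked example (editor's addition, not part of this commit) for the broadcast
// shape computed above: with an input of shape {1, 64, 56, 56}, axis == 1 and 64
// weights, the loops produce {1, 64, 1, 1}: dimensions before the channel axis
// become 1, the channel axis keeps 64 (consuming all weights), and the remaining
// dimensions collapse to 1 once numWeights reaches 1, so the constants broadcast
// per channel in the Multiply/Add nodes.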
private:
bool hasWeights;
std::vector<float> inp_sc;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <algorithm>
#include <stdlib.h>
@ -90,7 +91,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
(backendId == DNN_BACKEND_TIMVX && haveTimVX());
(backendId == DNN_BACKEND_TIMVX && haveTimVX()) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
@ -194,6 +196,26 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::Output<ngraph::Node> res;
if (logSoftMax) {
res = std::make_shared<ngraph::op::v5::LogSoftmax>(input, axis);
} else {
res = std::make_shared<ngraph::op::v1::Softmax>(input, axis);
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
template <bool with_log>
class SoftmaxInt8Invoker : public ParallelLoopBody {
public:

View File

@ -62,10 +62,15 @@ public:
{
std::vector<UMat> outputs;
outs.getUMatVector(outputs);
if (outs.depth() == CV_16S)
convertFp16(blobs[0], outputs[0]);
if (outs.depth() == CV_16S) {
auto blob = blobs[0];
if (blob.type() != CV_32F) {
blob.convertTo(blob, CV_32F);
}
convertFp16(blob, outputs[0]);
}
else
blobs[0].copyTo(outputs[0]);
blobs[0].convertTo(outputs[0], outputs[0].type());
return true;
}
#endif
@ -80,7 +85,7 @@ public:
std::vector<Mat> outputs;
outputs_arr.getMatVector(outputs);
blobs[0].copyTo(outputs[0]);
blobs[0].convertTo(outputs[0], outputs[0].type());
}
#ifdef HAVE_CANN
@ -123,9 +128,23 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::element::Type dType;
if (blobs[0].depth() == CV_32F) {
dType = ngraph::element::f32;
} else if (blobs[0].depth() == CV_32S) {
dType = ngraph::element::i32;
} else if (blobs[0].depth() == CV_8S) {
dType = ngraph::element::i8;
} else {
CV_Error(Error::StsNotImplemented, format("Unexpected Const data depth: %d", blobs[0].depth()));
}
std::shared_ptr<ngraph::Node> node =
std::make_shared<ngraph::op::Constant>(dType,
getShape<size_t>(blobs[0]),
blobs[0].data);
if (node->get_element_type() != ngraph::element::f32) {
node = std::make_shared<ngraph::op::Convert>(node, ngraph::element::f32);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif // HAVE_DNN_NGRAPH
@ -151,7 +170,11 @@ public:
auto context = reinterpret_cast<csl::CSLContext*>(context_);
CV_Assert(blobs.size() == 1);
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
Mat blob = blobs[0];
if (blob.type() != CV_32F) {
blob.convertTo(blob, CV_32F);
}
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blob);
}
#endif
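Note: both non-ngraph paths above normalize the constant blob to CV_32F before handing it to the backend, while the ngraph path keeps the original element type and appends a Convert node to f32. The fallback conversion is plain cv::Mat::convertTo; a tiny sketch (hypothetical int8 blob):

#include <opencv2/core.hpp>

cv::Mat blob = cv::Mat::ones(2, 3, CV_8S);  // e.g. an int8 constant blob
cv::Mat blobF32;
blob.convertTo(blobF32, CV_32F);            // what the OpenCL/CUDA branches do before dispatch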

View File

@ -201,8 +201,6 @@ public:
};
#define IS_POWER_LAYER(layer) \
(!layer.empty() && !layer->type.compare("Power"))
//TODO: perform convolution and bias addition simultaneously for cache optimization
class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
{

View File

@ -12,16 +12,16 @@
#include <opencv2/core/hal/intrin.hpp>
#include <opencv2/core/utility.hpp> // parallel_for_
#define FAST_GEMM_DEFAULT_STORAGE (1<<20) // 2^20
#define FAST_GEMM_DEFAULT_MAX_STACKBUF (1 << 14)
#define FAST_GEMM_STORAGE (1<<20) // 2^20
#define FAST_GEMM_MAX_STACKBUF (1 << 14)
#define FAST_GEMM_DEFAULT_F32_MC 64
#define FAST_GEMM_DEFAULT_F32_NC 240
#define FAST_GEMM_DEFAULT_F32_MR 8
#define FAST_GEMM_DEFAULT_F32_NR 12
#define FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K 256
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#define FAST_GEMM_DEFAULT_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
#define FAST_GEMM_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
int lda0, int lda1, void* packA_ ) \
{ \
@ -32,47 +32,47 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
const styp* a_ptr = A + lda0*i; \
for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
{ \
FAST_GEMM_DEFAULT_LOAD_TO_BUF_##N(styp); \
FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \
FAST_GEMM_LOAD_TO_BUF_##N(styp); \
FAST_GEMM_PACK##suffix##_##N(buf, packA); \
} \
} else { \
const styp* a_ptr[N]; \
for (int k = 0; k < N; k++) a_ptr[k] = A + lda0*(i+k < m ? i+k : i); \
for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
{ \
FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_##N(styp); \
FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \
FAST_GEMM_LOAD_TO_BUF_BORDERS_##N(styp); \
FAST_GEMM_PACK##suffix##_##N(buf, packA); \
} \
} \
} \
}
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_8(styp) \
#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_8(styp) \
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_8(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_12(styp) \
#define FAST_GEMM_LOAD_TO_BUF_12(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \
a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_12(styp) \
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_12(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \
a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j] }
#define FAST_GEMM_DEFAULT_PACK_COPY(src, dst, N) \
#define FAST_GEMM_PACK_COPY(src, dst, N) \
memcpy((dst), (src), N*sizeof(src[0]))
#define FAST_GEMM_DEFAULT_PACK_f32_8(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_DEFAULT_PACK_f32_12(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 12)
#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
namespace cv { namespace dnn { namespace cpu_baseline {
@ -88,20 +88,20 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz);
FAST_GEMM_DEFAULT_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_DEFAULT_IMPLEMENT_PACK(12, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
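For a concrete check of the computation above (values chosen arbitrarily, not taken from the source):

#include <cassert>

int main() {
    // mirrors fastGemmPackBSize with the baseline tile sizes (NC block <= 240, NR = 12)
    int N = 100, K = 64;
    int GEMM_NC = 240, GEMM_NR = 12;
    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;  // 108
    int size = ((N + NC - 1) / NC) * NC * K;                                     // 1 * 108 * 64
    assert(NC == 108 && size == 6912);  // packed B is padded up to a whole number of NC-wide panels
    return 0;
}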
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K);
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
@ -116,140 +116,50 @@ void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0,
}
}
#if CV_SIMD128
static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
static inline void fast_gemm_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = s00, s11 = s00, s12 = s00;
v_float32x4 s20 = s00, s21 = s00, s22 = s00;
v_float32x4 s30 = s00, s31 = s00, s32 = s00;
v_float32x4 s40 = s00, s41 = s00, s42 = s00;
v_float32x4 s50 = s00, s51 = s00, s52 = s00;
v_float32x4 s60 = s00, s61 = s00, s62 = s00;
v_float32x4 s70 = s00, s71 = s00, s72 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_DEFAULT_F32_MR, b += FAST_GEMM_DEFAULT_F32_NR) {
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 a0 = v_setall_f32(*a);
s00 = v_fma(b0, a0, s00);
s01 = v_fma(b1, a0, s01);
s02 = v_fma(b2, a0, s02);
v_float32x4 a1 = v_setall_f32(*(a + 1));
s10 = v_fma(b0, a1, s10);
s11 = v_fma(b1, a1, s11);
s12 = v_fma(b2, a1, s12);
v_float32x4 a2 = v_setall_f32(*(a + 2));
s20 = v_fma(b0, a2, s20);
s21 = v_fma(b1, a2, s21);
s22 = v_fma(b2, a2, s22);
v_float32x4 a3 = v_setall_f32(*(a + 3));
s30 = v_fma(b0, a3, s30);
s31 = v_fma(b1, a3, s31);
s32 = v_fma(b2, a3, s32);
a0 = v_setall_f32(*(a + 4));
s40 = v_fma(b0, a0, s40);
s41 = v_fma(b1, a0, s41);
s42 = v_fma(b2, a0, s42);
a1 = v_setall_f32(*(a + 5));
s50 = v_fma(b0, a1, s50);
s51 = v_fma(b1, a1, s51);
s52 = v_fma(b2, a1, s52);
a2 = v_setall_f32(*(a + 6));
s60 = v_fma(b0, a2, s60);
s61 = v_fma(b1, a2, s61);
s62 = v_fma(b2, a2, s62);
a3 = v_setall_f32(*(a + 7));
s70 = v_fma(b0, a3, s70);
s71 = v_fma(b1, a3, s71);
s72 = v_fma(b2, a3, s72);
}
v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha);
#define FAST_GEMM_FINALE(row0, row1) \
c0 = v_load(c + row0 * ldc); \
c1 = v_load(c + row0 * ldc + 4); \
c2 = v_load(c + row0 * ldc + 8); \
c3 = v_load(c + row1 * ldc); \
c4 = v_load(c + row1 * ldc + 4); \
c5 = v_load(c + row1 * ldc + 8); \
c0 = v_fma(s##row0##0, v_alpha, c0); \
c1 = v_fma(s##row0##1, v_alpha, c1); \
c2 = v_fma(s##row0##2, v_alpha, c2); \
c3 = v_fma(s##row1##0, v_alpha, c3); \
c4 = v_fma(s##row1##1, v_alpha, c4); \
c5 = v_fma(s##row1##2, v_alpha, c5); \
v_store(c + row0 * ldc, c0); \
v_store(c + row0 * ldc + 4, c1); \
v_store(c + row0 * ldc + 8, c2); \
v_store(c + row1 * ldc, c3); \
v_store(c + row1 * ldc + 4, c4); \
v_store(c + row1 * ldc + 8, c5);
FAST_GEMM_FINALE(0, 1);
FAST_GEMM_FINALE(2, 3);
FAST_GEMM_FINALE(4, 5);
FAST_GEMM_FINALE(6, 7);
#undef FAST_GEMM_FINALE
}
#else
static void fast_gemm_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
float sbuf[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR];
float sbuf[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR];
memset(sbuf, 0, sizeof(sbuf));
for(int p = 0; p < k; p++) {
for( int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++ ) {
float ai = a[FAST_GEMM_DEFAULT_F32_MR * p + i];
for( int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++ )
sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j] += b[FAST_GEMM_DEFAULT_F32_NR * p + j] * ai;
for( int i = 0; i < FAST_GEMM_F32_MR; i++ ) {
float ai = a[FAST_GEMM_F32_MR * p + i];
for( int j = 0; j < FAST_GEMM_F32_NR; j++ )
sbuf[i * FAST_GEMM_F32_NR + j] += b[FAST_GEMM_F32_NR * p + j] * ai;
}
}
for (int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++) {
for (int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++)
c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j];
for (int i = 0; i < FAST_GEMM_F32_MR; i++) {
for (int j = 0; j < FAST_GEMM_F32_NR; j++)
c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_F32_NR + j];
}
}
#endif // CV_SIMD128
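Both the SIMD body and the scalar fallback above compute the same register-blocked update on packed panels: for 0 <= i < MR and 0 <= j < NR,

    c[i*ldc + j] += alpha * sum_{p=0..k-1} packA[p*MR + i] * packB[p*NR + j]

with MR = 8 and NR = 12 in this baseline variant; only the instruction selection differs.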
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_DEFAULT_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_DEFAULT_F32_NR) {
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_DEFAULT_F32_MR ? m - i : FAST_GEMM_DEFAULT_F32_MR;
int nr = n - j < FAST_GEMM_DEFAULT_F32_NR ? n - j : FAST_GEMM_DEFAULT_F32_NR;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_DEFAULT_F32_MR) | (nr < FAST_GEMM_DEFAULT_F32_NR));
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_DEFAULT_F32_NR;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_SIMD128
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#else
fast_gemm_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
@ -263,19 +173,19 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC,
GEMM_NC = FAST_GEMM_DEFAULT_F32_NC,
GEMM_MR = FAST_GEMM_DEFAULT_F32_MR,
GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_DEFAULT_STORAGE / ((MC + NC) * esz);
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
@ -328,17 +238,17 @@ void fastGemmKernel(int M, int N, int K,
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC,
GEMM_NC = FAST_GEMM_DEFAULT_F32_NC,
GEMM_MR = FAST_GEMM_DEFAULT_F32_MR,
GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K);
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
@ -391,3 +301,29 @@ void fastGemmKernel(int M, int N, int K,
}
}}} // cv::dnn::cpu_baseline
#undef FAST_GEMM_STORAGE
#undef FAST_GEMM_MAX_STACKBUF
#ifdef FAST_GEMM_F32_MC
#undef FAST_GEMM_F32_MC
#endif
#ifdef FAST_GEMM_F32_NC
#undef FAST_GEMM_F32_NC
#endif
#ifdef FAST_GEMM_F32_MR
#undef FAST_GEMM_F32_MR
#endif
#ifdef FAST_GEMM_F32_NR
#undef FAST_GEMM_F32_NR
#endif
#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
#undef FAST_GEMM_F32_PACKED_STRIDE_K
#endif
#undef FAST_GEMM_IMPLEMENT_PACK
#undef FAST_GEMM_LOAD_TO_BUF_8
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
#undef FAST_GEMM_LOAD_TO_BUF_12
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
#undef FAST_GEMM_PACK_COPY
#undef FAST_GEMM_PACK_f32_8
#undef FAST_GEMM_PACK_f32_12
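For orientation, the cache-blocking numbers chosen by the unpacked fastGemmKernel above work out roughly as follows (a hedged arithmetic sketch; MC, NC and esz follow the baseline defines, K is arbitrary and assumed large):

#include <cstdio>

int main() {
    int MC = 64, NC = 240, esz = 4;              // float32 tiles, assuming M >= MC and N >= NC
    int K = 4096;
    int KC = (1 << 20) / ((MC + NC) * esz);      // FAST_GEMM_STORAGE / ((MC + NC) * esz) = 862
    KC = KC > 8 ? KC : 8;
    KC = KC < K ? KC : K;
    size_t buff = (size_t)KC * (MC + NC) * esz;  // ~1 MiB of packing space per worker
    // well above the 16 KiB FAST_GEMM_MAX_STACKBUF limit, so packed_a/packed_b come from malloc
    printf("KC = %d, buffer = %zu bytes\n", KC, buff);
    return 0;
}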

View File

@ -15,37 +15,31 @@
#define FAST_GEMM_STORAGE (1<<20) // 2^20
#define FAST_GEMM_MAX_STACKBUF (1 << 14)
#if CV_NEON
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_MC 60
#define FAST_GEMM_F32_NC 320
#elif CV_LASX
#define FAST_GEMM_F32_MC 48
#define FAST_GEMM_F32_NC 128
#else // CV_NEON_AARCH64, SIMD128
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#endif
// micro kernel size
#if CV_NEON && CV_NEON_AARCH64
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#elif CV_NEON
#define FAST_GEMM_F32_MR 4
#define FAST_GEMM_F32_NR 12
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_MR 12
#define FAST_GEMM_F32_NR 8
#elif CV_LASX
#define FAST_GEMM_F32_MR 12
#define FAST_GEMM_F32_NR 16
#else // CV_NEON_AARCH64, CV_SIMD128
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#endif
#if CV_NEON
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_PACKED_STRIDE_K 128
#elif CV_LASX
#else // CV_LASX, CV_NEON_AARCH64, CV_SIMD128
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#endif
@ -75,14 +69,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
} \
}
#define FAST_GEMM_LOAD_TO_BUF_4(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3] }
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_4(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j] }
#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
@ -121,7 +107,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
#define FAST_GEMM_PACK_COPY(src, dst, N) \
memcpy((dst), (src), N*sizeof(src[0]))
#define FAST_GEMM_PACK_f32_4(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 4)
#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
#define FAST_GEMM_PACK_f32_16(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 16)
@ -130,7 +115,6 @@ namespace cv { namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// TODO: type to size_t
int fastGemmPackBSize(int N, int K);
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz);
@ -143,44 +127,18 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz);
// NEON (AARCH64: 32 x 128-bit registers, armv7: 16 x 128-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_NEON_AARCH64
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
#else
FAST_GEMM_IMPLEMENT_PACK(4, _f32, float, float)
#endif
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
/*
Compute kernels optimized for different platforms
*/
#if CV_NEON && CV_NEON_AARCH64 // AARCH64: 32 x 128-bit registers
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
#if CV_NEON_AARCH64
static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -258,278 +216,17 @@ static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
#undef FAST_GEMM_FINALE
}
#else // CV_NEON_AARCH64
static void fast_gemm4x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
#elif CV_AVX // AVX and AVX2 (16 x 256-bit registers)
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00,
s10 = s00, s11 = s00, s12 = s00,
s20 = s00, s21 = s00, s22 = s00,
s30 = s00, s31 = s00, s32 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR)
{
float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
float32x4_t a0 = vld1q_dup_f32(a);
s00 = vmlaq_f32(a0, b0, s00);
s01 = vmlaq_f32(a0, b1, s01);
s02 = vmlaq_f32(a0, b2, s02);
a0 = vld1q_dup_f32(a + 1);
s10 = vmlaq_f32(a0, b0, s10);
s11 = vmlaq_f32(a0, b1, s11);
s12 = vmlaq_f32(a0, b2, s12);
a0 = vld1q_dup_f32(a + 2);
s20 = vmlaq_f32(a0, b0, s20);
s21 = vmlaq_f32(a0, b1, s21);
s22 = vmlaq_f32(a0, b2, s22);
a0 = vld1q_dup_f32(a + 3);
s30 = vmlaq_f32(a0, b0, s30);
s31 = vmlaq_f32(a0, b1, s31);
s32 = vmlaq_f32(a0, b2, s32);
}
float32x4_t c0, c1, c2, v_alpha = vdupq_n_f32(alpha);
#define FAST_GEMM_FINALE(row0) \
c0 = vld1q_f32(c + row0 * ldc); \
c1 = vld1q_f32(c + row0 * ldc + 4); \
c2 = vld1q_f32(c + row0 * ldc + 8); \
c0 = vmlaq_f32(c0, s##row0##0, v_alpha); \
c1 = vmlaq_f32(c1, s##row0##1, v_alpha); \
c2 = vmlaq_f32(c2, s##row0##2, v_alpha); \
vst1q_f32(c + row0 * ldc, c0); \
vst1q_f32(c + row0 * ldc + 4, c1); \
vst1q_f32(c + row0 * ldc + 8, c2);
FAST_GEMM_FINALE(0);
FAST_GEMM_FINALE(1);
FAST_GEMM_FINALE(2);
FAST_GEMM_FINALE(3);
#undef FAST_GEMM_FINALE
}
#endif // micro kernel CV_NEON_AARCH64
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_NEON_AARCH64
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#else
fast_gemm4x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
}
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
char* packed_b = packed_a + KC * MC * esz;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
#if CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#else
fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
const char *packed_b_ = packed_B;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
packed_b_ = packed_B + j0 * K * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
#if CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#else
fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_NEON, CV_NEON_AARCH64
// AVX and AVX2 (16 x 256-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
#if !CV_FMA3 // AVX workaround for FMA
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
static inline void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -599,203 +296,12 @@ static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, i
#undef FAST_GEMM_FINALE
}
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
#elif CV_LASX // LASX (32 x 256-bit registers)
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float) // b packer
if (partial) {
for(int p = 0; p < mr; p++)
memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
}
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
char* packed_b = packed_a + KC * MC * esz;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
const char *packed_b_ = packed_B;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
packed_b_ = packed_B + j0 * K * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_AVX, CV_AVX2
// LASX (32 x 256-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -889,9 +395,99 @@ static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_,
#undef FAST_GEMM_FINALE
}
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
#elif CV_SIMD128 // armv7: 16 x 128-bit registers
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = s00, s11 = s00, s12 = s00;
v_float32x4 s20 = s00, s21 = s00, s22 = s00;
v_float32x4 s30 = s00, s31 = s00, s32 = s00;
v_float32x4 s40 = s00, s41 = s00, s42 = s00;
v_float32x4 s50 = s00, s51 = s00, s52 = s00;
v_float32x4 s60 = s00, s61 = s00, s62 = s00;
v_float32x4 s70 = s00, s71 = s00, s72 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) {
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 a0 = v_setall_f32(*a);
s00 = v_fma(b0, a0, s00);
s01 = v_fma(b1, a0, s01);
s02 = v_fma(b2, a0, s02);
v_float32x4 a1 = v_setall_f32(*(a + 1));
s10 = v_fma(b0, a1, s10);
s11 = v_fma(b1, a1, s11);
s12 = v_fma(b2, a1, s12);
v_float32x4 a2 = v_setall_f32(*(a + 2));
s20 = v_fma(b0, a2, s20);
s21 = v_fma(b1, a2, s21);
s22 = v_fma(b2, a2, s22);
v_float32x4 a3 = v_setall_f32(*(a + 3));
s30 = v_fma(b0, a3, s30);
s31 = v_fma(b1, a3, s31);
s32 = v_fma(b2, a3, s32);
a0 = v_setall_f32(*(a + 4));
s40 = v_fma(b0, a0, s40);
s41 = v_fma(b1, a0, s41);
s42 = v_fma(b2, a0, s42);
a1 = v_setall_f32(*(a + 5));
s50 = v_fma(b0, a1, s50);
s51 = v_fma(b1, a1, s51);
s52 = v_fma(b2, a1, s52);
a2 = v_setall_f32(*(a + 6));
s60 = v_fma(b0, a2, s60);
s61 = v_fma(b1, a2, s61);
s62 = v_fma(b2, a2, s62);
a3 = v_setall_f32(*(a + 7));
s70 = v_fma(b0, a3, s70);
s71 = v_fma(b1, a3, s71);
s72 = v_fma(b2, a3, s72);
}
v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha);
#define FAST_GEMM_FINALE(row0, row1) \
c0 = v_load(c + row0 * ldc); \
c1 = v_load(c + row0 * ldc + 4); \
c2 = v_load(c + row0 * ldc + 8); \
c3 = v_load(c + row1 * ldc); \
c4 = v_load(c + row1 * ldc + 4); \
c5 = v_load(c + row1 * ldc + 8); \
c0 = v_fma(s##row0##0, v_alpha, c0); \
c1 = v_fma(s##row0##1, v_alpha, c1); \
c2 = v_fma(s##row0##2, v_alpha, c2); \
c3 = v_fma(s##row1##0, v_alpha, c3); \
c4 = v_fma(s##row1##1, v_alpha, c4); \
c5 = v_fma(s##row1##2, v_alpha, c5); \
v_store(c + row0 * ldc, c0); \
v_store(c + row0 * ldc + 4, c1); \
v_store(c + row0 * ldc + 8, c2); \
v_store(c + row1 * ldc, c3); \
v_store(c + row1 * ldc + 4, c4); \
v_store(c + row1 * ldc + 8, c5);
FAST_GEMM_FINALE(0, 1);
FAST_GEMM_FINALE(2, 3);
FAST_GEMM_FINALE(4, 5);
FAST_GEMM_FINALE(6, 7);
#undef FAST_GEMM_FINALE
}
#endif
static inline void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
@ -911,7 +507,15 @@ static void fast_gemm_macro_kernel(int m, int n, int k,
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_NEON && CV_NEON_AARCH64
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_AVX
fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_LASX
fast_gemm12x16_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_SIMD128
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
@ -921,6 +525,39 @@ static void fast_gemm_macro_kernel(int m, int n, int k,
}
}
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_AVX
fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_LASX
fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_SIMD128
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#endif
packed_B += _nc * kc;
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
@ -970,8 +607,29 @@ void fastGemmKernel(int M, int N, int K,
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
// pack a
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_AVX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_LASX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_SIMD128
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
// pack b
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_AVX
fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_LASX
fast_gemm_pack16_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_SIMD128
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#endif
// run kernel
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
@ -1035,7 +693,18 @@ void fastGemmKernel(int M, int N, int K,
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
// pack a
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_AVX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_LASX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_SIMD128
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
// run kernel
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
@ -1052,8 +721,37 @@ void fastGemmKernel(int M, int N, int K,
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_LASX
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // cv::dnn
#undef FAST_GEMM_STORAGE
#undef FAST_GEMM_MAX_STACKBUF
#ifdef FAST_GEMM_F32_MC
#undef FAST_GEMM_F32_MC
#endif
#ifdef FAST_GEMM_F32_NC
#undef FAST_GEMM_F32_NC
#endif
#ifdef FAST_GEMM_F32_MR
#undef FAST_GEMM_F32_MR
#endif
#ifdef FAST_GEMM_F32_NR
#undef FAST_GEMM_F32_NR
#endif
#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
#undef FAST_GEMM_F32_PACKED_STRIDE_K
#endif
#undef FAST_GEMM_IMPLEMENT_PACK
#undef FAST_GEMM_LOAD_TO_BUF_8
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
#undef FAST_GEMM_LOAD_TO_BUF_12
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
#undef FAST_GEMM_LOAD_TO_BUF_16
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_16
#undef FAST_GEMM_PACK_COPY
#undef FAST_GEMM_PACK_f32_8
#undef FAST_GEMM_PACK_f32_12
#undef FAST_GEMM_PACK_f32_16

View File

@ -47,73 +47,76 @@ public:
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
// Get x tensor.
const auto &src_mat = inputs[0];
const auto *src_ptr = src_mat.ptr<float>();
// Get input tensor.
const auto& src_mat = inputs[0];
const auto* src_ptr = src_mat.ptr<float>();
// Get axis.
const int axis = normalize_axis(axis_raw, src_mat.dims);
// Get target axis.
int axis = inputs.size() > 1 ? parseAxis(inputs[1]) : axis_raw;
axis = normalize_axis(axis, src_mat.dims);
// Get y tensor.
auto &dst_mat = outputs[0];
src_mat.copyTo(dst_mat);
auto *dst_ptr = dst_mat.ptr<float>();
// Get output tensor.
auto& dst_mat = outputs[0];
auto* dst_ptr = dst_mat.ptr<float>();
// Get flags.
const auto exclusive = exclusive_raw == 1;
const auto reverse = reverse_raw == 1;
// Get parameters to iterate outer dimension.
// Data with [dim_1, .. , dim_k-1, target_dim, dim_k+1, .. , dim_n]
// dimensions is represented here as [outer_dim, target_dim, inner_dim]
const size_t outer_size = src_mat.total(0, axis);
const size_t outer_step_length = src_mat.total(axis);
const size_t target_size = src_mat.size[axis];
const size_t inner_size = src_mat.total(axis + 1);
const size_t outer_step_length = target_size * inner_size;
// Get parameters to iterate inner dimension.
const size_t inner_size = src_mat.size[axis];
// Calculating steps in target dimensions
const int target_start = reverse ? target_size - 1 : 0;
const int target_stop = reverse ? -1 : target_size;
const int target_delta = reverse ? -1 : 1;
const int target_step = target_delta * inner_size;
if (!inner_size)
return;
// If exclusive, the j-th output element would be the sum of the first (j-1) elements.
// Otherwise, it would be the sum of the first j elements.
const int exclusive_delta = exclusive ? target_step : 0;
const size_t inner_step_length = src_mat.total(axis + 1);
const int inner_step = (reverse ? -1 : 1) * inner_step_length;
const int inner_start = reverse ? inner_size - 1 : 0;
const int inner_stop = reverse ? -1 : inner_size;
const int inner_delta = reverse ? -1 : 1;
// Get parameters to populate channels.
const size_t num_channels = src_mat.total(axis + 1);
for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++)
for (size_t outer_idx = 0; outer_idx < outer_size; outer_idx++)
{
const size_t outer_offset = outer_dim * outer_step_length;
size_t src_offset = outer_offset + inner_start * inner_step_length;
const size_t target_offset = outer_idx * outer_step_length;
// Populate first element of inner dimension.
for (size_t channel = 0; channel < num_channels; channel++)
// Handle first element of target dimension.
size_t first_inner_offset = target_offset + target_start * inner_size;
if (exclusive)
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
dst_ptr[first_inner_offset + inner_idx] = 0.0f;
else
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
dst_ptr[first_inner_offset + inner_idx] = src_ptr[first_inner_offset + inner_idx];
// Handle remaining elements of target dimension.
for (int target_idx = target_start + target_delta; target_idx != target_stop; target_idx += target_delta)
{
if (exclusive)
const size_t inner_offset = target_offset + target_idx * inner_size;
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
{
dst_ptr[src_offset + channel] = 0.0f;
}
else
{
dst_ptr[src_offset + channel] = src_ptr[src_offset + channel];
src_offset += inner_step;
dst_ptr[inner_offset + inner_idx] = dst_ptr[inner_offset - target_step + inner_idx] +
src_ptr[inner_offset - exclusive_delta + inner_idx];
}
}
}
}
// Populate remaining elements of inner dimension.
for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
{
const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
for (size_t channel = 0; channel < num_channels; channel++)
{
const size_t previous_dst_offset = dst_offset - inner_step;
dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
src_ptr[src_offset + channel];
src_offset += inner_step;
}
}
int parseAxis(const Mat& axis_mat) {
CV_CheckEQ(axis_mat.total(), 1u, "Axis tensor should contain single value");
if (axis_mat.type() == CV_32SC1)
return axis_mat.at<int32_t>(0);
else
{
Mat axis_mat_int;
axis_mat.convertTo(axis_mat_int, CV_32SC1);
return axis_mat_int.at<int32_t>(0);
}
}
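As a quick sanity check of the indexing above: for a 1-D input [1, 2, 3, 4] with axis = 0 (outer_size = 1, target_size = 4, inner_size = 1), the inclusive forward pass yields [1, 3, 6, 10]; with exclusive = 1 the first output is zeroed and each later element reads src one step behind, giving [0, 1, 3, 6]; with reverse = 1 the same recurrence runs from the far end, giving [10, 9, 7, 4] in the inclusive case.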

View File

@ -38,7 +38,6 @@ Mat batchwiseMatMul(
const Mat& input2,
const MatShape& input2ShapeOverride)
{
// Sanity checks before the actual MatMul
//input_1.DataType() == input_2.DataType(), "Data types of the inputs must match for MatMul");
@ -391,6 +390,15 @@ public:
OutputArrayOfArrays outputs_arr,
OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
// homogenize inputs
preProcessInputs(inputs_arr);

View File

@ -984,13 +984,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float one = 1.0f;
auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
auto exp_node = std::make_shared<ngraph::op::v0::Exp>(node);
auto sum = std::make_shared<ngraph::op::v1::Add>(constant, exp_node, ngraph::op::AutoBroadcastType::NUMPY);
auto log_node = std::make_shared<ngraph::op::v0::Log>(sum);
auto tanh_node = std::make_shared<ngraph::op::Tanh>(log_node);
return std::make_shared<ngraph::op::v1::Multiply>(node, tanh_node);
return std::make_shared<ngraph::op::v4::Mish>(node);
}
#endif // HAVE_DNN_NGRAPH
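For reference, the subgraph removed here built mish(x) = x * tanh(ln(1 + exp(x))) out of Exp/Add/Log/Tanh/Multiply nodes; ngraph::op::v4::Mish computes the same function as a single fused op.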
@ -1190,10 +1184,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float coeff = -0.999999f;
// float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeff);
return std::make_shared<ngraph::op::PRelu>(node, slope);
return std::make_shared<ngraph::op::Abs>(node);
}
#endif // HAVE_DNN_NGRAPH
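Similarly, the removed PRelu trick relied on prelu(x) = x for x >= 0 and slope * x for x < 0, so a slope of -0.999999 only approximates |x|; the replacement uses the exact Abs op.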
@ -2563,11 +2554,6 @@ template<>
const char* const ReciprocalFunctor::BaseDefaultFunctor<ReciprocalFunctor>::ocl_kernel_name = "ReciprocalForward";
#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
Ptr<_Layer> _Layer::create() { \
return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);

View File

@ -191,7 +191,6 @@ public:
size_t dims_Y = shape_Y.size();
int M = shape_Y[dims_Y - 2], N = shape_Y[dims_Y - 1];
int K = trans_a ? ma : na;
int batches = std::accumulate(shape_A.begin(), shape_A.end() - 2, 1, std::multiplies<int>());
// broadcast C and copy C to output
if (have_bias) {
@ -201,9 +200,7 @@ public:
int step = M * N;
CV_CheckEQ(broadcast_C.size(), static_cast<size_t>(step), "DNN/Gemm: C is not broadcast properly");
float *ptr_y = Y.ptr<float>();
for (int i = 0; i < batches; i++) {
std::memcpy(ptr_y + i * step, broadcast_C.data(), step * sizeof(float));
}
std::memcpy(ptr_y, broadcast_C.data(), step * sizeof(float));
} else { // initialization
float *ptr_y = Y.ptr<float>();
size_t total = Y.total();
@ -212,7 +209,6 @@ public:
if (const_B) {
CV_CheckGT(packed_B.size(), static_cast<size_t>(0), "DNN/Gemm: constant B is not pre-packed");
M *= batches;
fastGemm(trans_a, M, N, K, alpha, A.ptr<const float>(), na, packed_B.data(), 1.f, Y.ptr<float>(), N, opt);
} else {
fastGemmBatched(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt);

View File

@ -359,11 +359,11 @@ public:
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
if (logSoftMax)
return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
return Ptr<BackendNode>(new InfEngineNgraphNode(softmax));
if (logSoftMax) {
return new InfEngineNgraphNode(std::make_shared<ngraph::op::v5::LogSoftmax>(ieInpNode, axis));
} else {
return new InfEngineNgraphNode(std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis));
}
}
#endif // HAVE_DNN_NGRAPH

View File

@ -23,7 +23,7 @@ BackendNode::BackendNode(int backendId)
: backendId(backendId)
{}
BackendNode::~BackendNode() {};
BackendNode::~BackendNode() {}
BackendWrapper::BackendWrapper(int backendId, int targetId)
: backendId(backendId)

View File

@ -306,9 +306,9 @@ void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
}
KeypointsModel::KeypointsModel(const String& model, const String& config)
: Model(model, config) {};
: Model(model, config) {}
KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};
KeypointsModel::KeypointsModel(const Net& network) : Model(network) {}
std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
{
@ -364,9 +364,9 @@ std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
}
SegmentationModel::SegmentationModel(const String& model, const String& config)
: Model(model, config) {};
: Model(model, config) {}
SegmentationModel::SegmentationModel(const Net& network) : Model(network) {};
SegmentationModel::SegmentationModel(const Net& network) : Model(network) {}
void SegmentationModel::segment(InputArray frame, OutputArray mask)
{

View File

@ -155,11 +155,19 @@ void Net::Impl::setPreferableBackend(Net& net, int backendId)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
backendId = DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; // = getInferenceEngineBackendTypeParam();
if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX)
if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX &&
backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
CV_LOG_WARNING(NULL, "DNN: Only default and TIMVX backends support quantized networks");
CV_LOG_WARNING(NULL, "DNN: Only default, TIMVX and OpenVINO backends support quantized networks");
backendId = DNN_BACKEND_OPENCV;
}
#ifdef HAVE_DNN_NGRAPH
if (netWasQuantized && backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2023_0))
{
CV_LOG_WARNING(NULL, "DNN: OpenVINO 2023.0 and higher is required to supports quantized networks");
backendId = DNN_BACKEND_OPENCV;
}
#endif
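With this relaxation a quantized network can be dispatched to OpenVINO directly. A minimal usage sketch (model path hypothetical; the calls mirror what the int8 tests below do with net.quantize and setPreferableBackend):

#include <opencv2/dnn.hpp>
#include <vector>

// calibData must hold representative float inputs for calibration; "model.onnx" is a placeholder.
static cv::dnn::Net quantizeForOpenVINO(const std::vector<cv::Mat>& calibData) {
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    cv::dnn::Net qnet = net.quantize(calibData, CV_8S, CV_8S);  // per-channel quantization by default
    qnet.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
    qnet.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    return qnet;
}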
if (preferableBackend != backendId)
{

View File

@ -48,7 +48,6 @@ public:
CV_Assert(basePtr_);
Net::Impl& base = *basePtr_;
CV_Assert(!base.netWasAllocated);
CV_Assert(!base.netWasQuantized);
netInputLayer = base.netInputLayer;
blobsToKeep = base.blobsToKeep;
layers = base.layers;

View File

@ -383,7 +383,7 @@ void runLayer(LayerParams& params, const std::vector<Mat>& inputs,
{
inpShapes[i] = shape(inputs[i]);
if (i > 0 && ddepth != inputs[i].depth())
CV_Error(Error::StsNotImplemented, "Mixed input data types.");
CV_Error(Error::StsNotImplemented, cv::format("Mixed input data types. Required type: %d, actual type: %d", ddepth, inputs[i].depth()));
// Quantize and Dequantize layer have different output type than input.
if (params.type != "Quantize" && params.type != "Dequantize")
@ -1502,7 +1502,7 @@ void ONNXImporter::lstm_extractConsts(LayerParams& layerParams, const opencv_onn
blob = Mat(blobShape, CV_32FC1, 0.);
}
layerParams.blobs.push_back(blob);
};
}
void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::string& output_name, int* layerShape, size_t n)
{
@ -1517,7 +1517,7 @@ void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::st
reshape_proto.add_input(input_name);
reshape_proto.add_output(output_name);
addLayer(reshapeLp, reshape_proto);
};
}
std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_name, int* begin, int* end, size_t n)
{
@ -1536,7 +1536,7 @@ std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_nam
addLayer(sliceLP, slice_proto);
return slice_proto.output(0);
};
}
std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_onnx::NodeProto& lstm_proto,
int batch_size, int num_directions, int hidden_size, bool need_y, const std::string& y_name,
@ -1564,7 +1564,7 @@ std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_o
addLayer(permuteLP, permute_proto);
return permute_proto.output(0);
};
}
void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hidden_size,
int index, const std::string& input_name, const std::string& output_name)
@ -1606,7 +1606,7 @@ void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hi
int layerShape[] = {2, batch_size, hidden_size};
lstm_add_reshape(concat_proto.output(0), output_name, layerShape, sizeof(layerShape) / sizeof(layerShape[0]));
}
};
}
void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
{

View File

@ -27,6 +27,7 @@
#define INF_ENGINE_RELEASE_2021_3 2021030000
#define INF_ENGINE_RELEASE_2021_4 2021040000
#define INF_ENGINE_RELEASE_2022_1 2022010000
#define INF_ENGINE_RELEASE_2023_0 2023000000
#ifndef INF_ENGINE_RELEASE
#warning("IE version have not been provided via command-line. Using 2021.4 by default")

View File

@ -3227,7 +3227,7 @@ void TFLayerHandler::fillRegistry(const tensorflow::GraphDef& net)
}
}
printMissing();
};
}
bool TFLayerHandler::handleMissing(const tensorflow::NodeDef& layer)
{

View File

@ -151,10 +151,12 @@ TEST_P(DNNTestNetwork, ENet)
{
applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
@ -482,7 +484,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
Mat img = imread(findDataFile("dnn/googlenet_1.png"));
Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
// Output image has values in range [-143.526, 148.539].
float l1 = 2e-4, lInf = 2e-3;
float l1 = 2e-4, lInf = 2.4e-3;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
{
l1 = 0.4;
@ -875,8 +877,12 @@ TEST_P(MaxPooling, Accuracy)
Target targetId = get<1>(get<5>(GetParam()));
// https://github.com/openvinotoolkit/openvino/issues/18731
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1))
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1)) {
int ow = ceil(static_cast<float>(inSize.width + 2 * pad.width - kernel.width) / stride.width);
int oh = ceil(static_cast<float>(inSize.height + 2 * pad.height - kernel.height) / stride.height);
if (ow * stride.width >= inSize.width + pad.width || oh * stride.height >= inSize.height + pad.height)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
@ -1026,10 +1032,12 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, SoftMax, testing::Combine(
//////////////////////////////////////////////////////////////////////////////
TEST_P(Test_layers_backends, MaxPoolUnpool)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
LayerParams pool;
pool.set("pool", "max");

View File

@ -14,6 +14,9 @@ testing::internal::ParamGenerator< tuple<Backend, Target> > dnnBackendsAndTarget
targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
#ifdef HAVE_TIMVX
targets.push_back(make_tuple(DNN_BACKEND_TIMVX, DNN_TARGET_NPU));
#endif
#ifdef HAVE_INF_ENGINE
targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
#endif
return testing::ValuesIn(targets);
}
@ -66,8 +69,6 @@ public:
outPath = _tf("onnx/data/output_" + basename);
}
ASSERT_FALSE(net.empty());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
for (int i = 0; i < numInps; i++)
inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy"));
@ -78,6 +79,8 @@ public:
qnet = net.quantize(inps, CV_8S, CV_8S, perChannel);
qnet.getInputDetails(inputScale, inputZp);
qnet.getOutputDetails(outputScale, outputZp);
qnet.setPreferableBackend(backend);
qnet.setPreferableTarget(target);
// Quantize inputs to int8
// int8_value = float_value/scale + zero-point
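As a side note, the mapping quoted in the comment above (int8_value = float_value/scale + zero-point) is the affine quantization this test applies to its inputs, with scale and zero point taken from getInputDetails(). A minimal sketch of that mapping, assuming a CV_32F blob and per-tensor parameters (the helper name is ours, not part of this diff):

#include <opencv2/core.hpp>

// dst = saturate_cast<schar>(src / scale + zeroPoint), rounded to nearest,
// which is what convertTo does with alpha = 1/scale and beta = zeroPoint.
static cv::Mat quantizeToInt8(const cv::Mat& blob, float inputScale, int inputZp)
{
    cv::Mat q;
    blob.convertTo(q, CV_8S, 1.0 / inputScale, inputZp);
    return q;
}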
@ -98,7 +101,7 @@ public:
if (out_i.dims == 2 && ref_i.dims == 1) {
ref_i = ref_i.reshape(1, 1);
}
normAssert(ref_i, out_i, "", l1, lInf);
normAssert(ref_i, out_i, basename.c_str(), l1, lInf);
}
}
};
@ -201,10 +204,13 @@ TEST_P(Test_Int8_layers, Padding)
TEST_P(Test_Int8_layers, AvePooling)
{
testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
// Some tests fail with OpenVINO due to an incorrect padded-area calculation
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041);
testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048);
testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006);
if (target != DNN_TARGET_CPU)
@ -220,8 +226,6 @@ TEST_P(Test_Int8_layers, MaxPooling)
throw SkipTestException("Only CPU is supported");
testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124);
/* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true
which is not supported by int8 maxpooling
testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004);
testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139);
testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012);
@ -231,7 +235,7 @@ TEST_P(Test_Int8_layers, MaxPooling)
testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052);
testLayer("maxpooling", "ONNX", 0.0034, 0.0065);
testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052);
testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/
testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);
}
TEST_P(Test_Int8_layers, Reduce)
@ -326,7 +330,10 @@ TEST_P(Test_Int8_layers, DISABLED_Softmax_unfused_ONNX) // FIXIT Support 'Ident
TEST_P(Test_Int8_layers, Concat)
{
testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false);
testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
// Crashes with segfault
testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
}
testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089);
testLayer("concat_3d", "TensorFlow", 0.005, 0.014);
testLayer("concatenation", "ONNX", 0.0032, 0.009);
@ -404,10 +411,13 @@ TEST_P(Test_Int8_layers, Reshape)
testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029);
testLayer("reshape_conv", "TensorFlow", 0.035, 0.054);
testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("reshape_reduce", "TensorFlow", 0.0053, 0.011);
else
testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028);
testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028);
testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014);
testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.016 : 0.014);
testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079);
testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081);
testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081);
@ -495,10 +505,10 @@ TEST_P(Test_Int8_layers, Eltwise)
testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false);
testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047);
testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365
testLayer("eltwise_add_vec", "TensorFlow", 0.037, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.24 : 0.21); // tflite 0.0095, 0.0365
testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017
testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063);
testLayer("split_equals", "TensorFlow", 0.02, 0.065);
testLayer("split_equals", "TensorFlow", backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.021 : 0.02, 0.065);
testLayer("mul", "ONNX", 0.0039, 0.014);
testLayer("split_max", "ONNX", 0.004, 0.012);
}
@ -555,10 +565,10 @@ public:
Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb"));
Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb"));
Net baseNet = readNetFromONNX(onnxmodel);
baseNet.setPreferableBackend(backend);
baseNet.setPreferableTarget(target);
Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
qnet.setPreferableBackend(backend);
qnet.setPreferableTarget(target);
qnet.setInput(blob);
Mat out = qnet.forward();
@ -703,9 +713,6 @@ TEST_P(Test_Int8_nets, AlexNet)
#else
applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
#endif
if (backend != DNN_BACKEND_OPENCV)
throw SkipTestException("Only OpenCV backend is supported");
if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
@ -746,8 +753,6 @@ TEST_P(Test_Int8_nets, GoogLeNet)
TEST_P(Test_Int8_nets, ResNet50)
{
applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
if (backend != DNN_BACKEND_OPENCV)
throw SkipTestException("Only OpenCV backend is supported");
if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -778,6 +783,8 @@ TEST_P(Test_Int8_nets, DenseNet121)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false),
findDataFile("dnn/DenseNet_121.caffemodel", false));
@ -959,6 +966,8 @@ TEST_P(Test_Int8_nets, opencv_face_detector)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"),
findDataFile("dnn/opencv_face_detector.caffemodel", false));
@ -1025,7 +1034,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_resnet50)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -1052,7 +1062,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -1083,6 +1094,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"),
findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false));
@ -1110,6 +1123,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_zf)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"),
findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false));
@ -1142,6 +1157,9 @@ TEST_P(Test_Int8_nets, RFCN)
0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11;
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
iouDiff = 0.12;
}
testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
}
@ -1321,6 +1339,8 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
const float confThreshold = 0.6;

View File

@ -413,10 +413,12 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc)
TEST_P(Test_Caffe_layers, Reshape_Split_Slice)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt"));
ASSERT_FALSE(net.empty());
@ -795,8 +797,10 @@ TEST_P(Test_Caffe_layers, DataAugmentation)
TEST_P(Test_Caffe_layers, Resample)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend != DNN_BACKEND_OPENCV)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testLayerUsingCaffeModels("nearest_2inps", false, false, 0.0, 0.0, 2);
testLayerUsingCaffeModels("nearest", false, false);
}

View File

@ -4,4 +4,4 @@
#include <hpx/hpx_main.hpp>
#endif
CV_TEST_MAIN("", initDNNTests());
CV_TEST_MAIN("", initDNNTests())

View File

@ -1236,4 +1236,4 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_conformance,
printOnnxConfParams
);
};
}

View File

@ -46,6 +46,13 @@
"test_conv_with_strides_and_asymmetric_padding",
"test_conv_with_strides_no_padding",
"test_conv_with_strides_padding",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_div_bcast",
"test_div_uint8",
"test_dropout_default_ratio",

View File

@ -40,6 +40,13 @@
"test_cast_STRING_to_FLOAT",
"test_castlike_FLOAT_to_STRING_expanded",
"test_castlike_STRING_to_FLOAT_expanded",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_concat_1d_axis_negative_1",
"test_div_uint8",
"test_flatten_axis0",

View File

@ -89,13 +89,6 @@
"test_convtranspose_pad",
"test_convtranspose_pads",
"test_convtranspose_with_kernel",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_dequantizelinear",
"test_dequantizelinear_axis",
"test_det_2d",
@ -547,3 +540,11 @@
"test_xor_bcast4v2d",
"test_xor_bcast4v3d",
"test_xor_bcast4v4d",
// Cumsum related issue: https://github.com/opencv/opencv/issues/24437
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",

View File

@ -681,6 +681,9 @@ TEST_P(Test_ONNX_layers, Compare_GT)
testONNXModels("greater");
}
TEST_P(Test_ONNX_layers, Greater_input_dtype_int64) {
testONNXModels("greater_input_dtype_int64");
}
TEST_P(Test_ONNX_layers, Compare_LT)
{
@ -1063,10 +1066,12 @@ TEST_P(Test_ONNX_layers, ResizeUnfused)
TEST_P(Test_ONNX_layers, ResizeUnfusedTwoInputs)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("upsample_unfused_two_inputs_opset9_torch1.4", npy, 0, 0, false, true, 2);
testONNXModels("upsample_unfused_two_inputs_opset11_torch1.4", npy, 0, 0, false, true, 2);
}
@ -1170,10 +1175,12 @@ TEST_P(Test_ONNX_layers, ReduceL2)
TEST_P(Test_ONNX_layers, Split)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("split_0");
testONNXModels("split_1");
testONNXModels("split_2");
@ -1249,10 +1256,12 @@ TEST_P(Test_ONNX_layers, Softmax)
TEST_P(Test_ONNX_layers, Split_EltwiseMax)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("split_max");
}
@ -2058,12 +2067,16 @@ TEST_P(Test_ONNX_layers, Quantized_Unsqueeze)
TEST_P(Test_ONNX_layers, Quantized_Resize)
{
testONNXModels("quantized_resize_nearest");
testONNXModels("quantized_resize_bilinear", npy, 2e-4, 0.003);
testONNXModels("quantized_resize_bilinear_align", npy, 3e-4, 0.003);
double l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 2e-4;
testONNXModels("quantized_resize_bilinear", npy, l1, 0.003);
l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 3e-4;
testONNXModels("quantized_resize_bilinear_align", npy, l1, 0.003);
}
TEST_P(Test_ONNX_layers, Quantized_Concat)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
testONNXModels("quantized_concat");
testONNXModels("quantized_concat_const_blob");
}
@ -2080,6 +2093,8 @@ TEST_P(Test_ONNX_layers, OutputRegistration)
TEST_P(Test_ONNX_layers, QLinearSoftmax)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
testONNXModels("qlinearsoftmax_v11", npy, 0.002, 0.002); // 2D coerced
testONNXModels("qlinearsoftmax_v13", npy, 0.002, 0.002);
}
@ -2669,37 +2684,37 @@ TEST_P(Test_ONNX_layers, where_node)
testONNXModels("where_layer");
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_all_attributes) {
TEST_P(Test_ONNX_layers, Gemm_all_attributes) {
testONNXModels("test_gemm_all_attributes", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_alpha) {
TEST_P(Test_ONNX_layers, Gemm_alpha) {
testONNXModels("test_gemm_alpha", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_beta) {
TEST_P(Test_ONNX_layers, Gemm_beta) {
testONNXModels("test_gemm_beta", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_matrix_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_matrix_bias) {
testONNXModels("test_gemm_default_matrix_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_no_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_no_bias) {
testONNXModels("test_gemm_default_no_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_scalar_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_scalar_bias) {
testONNXModels("test_gemm_default_scalar_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_single_elem_vector_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_single_elem_vector_bias) {
testONNXModels("test_gemm_default_single_elem_vector_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_vector_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_vector_bias) {
testONNXModels("test_gemm_default_vector_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_zero_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_zero_bias) {
testONNXModels("test_gemm_default_zero_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeA) {
TEST_P(Test_ONNX_layers, Gemm_transposeA) {
testONNXModels("test_gemm_transposeA", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeB) {
TEST_P(Test_ONNX_layers, Gemm_transposeB) {
testONNXModels("test_gemm_transposeB", pb, 0, 0, false, true, 2);
}

View File

@ -619,10 +619,12 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_sum_1_2_true)
TEST_P(Test_TensorFlow_layers, max_pool_grad)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("max_pool_grad");
}
@ -1496,17 +1498,21 @@ TEST_P(Test_TensorFlow_layers, split)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("split");
}
TEST_P(Test_TensorFlow_layers, split_equals)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("split_equals");
}
@ -1581,7 +1587,7 @@ TEST_P(Test_TensorFlow_layers, relu6)
TEST_P(Test_TensorFlow_layers, subpixel)
{
#if defined(INF_ENGINE_RELEASE)
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
@ -1621,8 +1627,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_align_corners)
// TF case: align_corners=False, half_pixel_centers=True
TEST_P(Test_TensorFlow_layers, resize_bilinear_half_pixel)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("resize_bilinear", false, 0.0, 0.0, false, "_half_pixel");
}
@ -1636,8 +1644,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_factor)
// TF case: align_corners=False, half_pixel_centers=True
TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_half_pixel)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_half_pixel");
}

View File

@ -204,6 +204,10 @@ TEST_P(Test_TFLite, max_unpooling)
}
TEST_P(Test_TFLite, EfficientDet_int8) {
if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
throw SkipTestException("Only the OpenCV, TimVX and OpenVINO backends support INT8 on the CPU target");
}
Net net = readNet(findDataFile("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);

View File

@ -449,7 +449,7 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
throw SkipTestException("");
}
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif

View File

@ -872,11 +872,15 @@ public:
@param nOctaveLayers Default number of sublevels per scale level
@param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
DIFF_CHARBONNIER
@param max_points Maximum number of returned points. If the image contains
more features, only those with the highest response are returned.
A negative value means no limit.
*/
CV_WRAP static Ptr<AKAZE> create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB,
int descriptor_size = 0, int descriptor_channels = 3,
float threshold = 0.001f, int nOctaves = 4,
int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2,
int max_points = -1);
CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0;
CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0;
@ -899,6 +903,9 @@ public:
CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
CV_WRAP virtual void setMaxPoints(int max_points) = 0;
CV_WRAP virtual int getMaxPoints() const = 0;
};
//! @} features2d_main
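For reference, the max_points parameter declared above caps the number of keypoints AKAZE returns, keeping those with the strongest response (a negative value disables the limit). A minimal usage sketch against the extended create() signature from this diff; the image path and the cap of 500 are illustrative:

#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("scene.png", cv::IMREAD_GRAYSCALE); // illustrative input
    // Same defaults as before, plus a cap of 500 keypoints with the highest response.
    cv::Ptr<cv::AKAZE> akaze = cv::AKAZE::create(cv::AKAZE::DESCRIPTOR_MLDB, 0, 3,
                                                 0.001f, 4, 4, cv::KAZE::DIFF_PM_G2,
                                                 /*max_points=*/500);
    std::vector<cv::KeyPoint> keypoints;
    cv::Mat descriptors;
    akaze->detectAndCompute(img, cv::noArray(), keypoints, descriptors);
    return 0;
}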

View File

@ -58,7 +58,7 @@ public class AKAZEDescriptorExtractorTest extends OpenCVTestCase {
extractor.write(filename);
String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n";
String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\nmax_points: -1\n";
String actual = readFile(filename);
actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
assertEquals(truth, actual);

View File

@ -61,7 +61,7 @@ namespace cv
{
public:
AKAZE_Impl(DescriptorType _descriptor_type, int _descriptor_size, int _descriptor_channels,
float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity)
float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity, int _max_points)
: descriptor(_descriptor_type)
, descriptor_channels(_descriptor_channels)
, descriptor_size(_descriptor_size)
@ -69,6 +69,7 @@ namespace cv
, octaves(_octaves)
, sublevels(_sublevels)
, diffusivity(_diffusivity)
, max_points(_max_points)
{
}
@ -98,6 +99,9 @@ namespace cv
void setDiffusivity(KAZE::DiffusivityType diff_) CV_OVERRIDE{ diffusivity = diff_; }
KAZE::DiffusivityType getDiffusivity() const CV_OVERRIDE{ return diffusivity; }
void setMaxPoints(int max_points_) CV_OVERRIDE { max_points = max_points_; }
int getMaxPoints() const CV_OVERRIDE { return max_points; }
// returns the descriptor size in bytes
int descriptorSize() const CV_OVERRIDE
{
@ -195,6 +199,12 @@ namespace cv
KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat());
}
if (max_points > 0 && (int)keypoints.size() > max_points) {
std::partial_sort(keypoints.begin(), keypoints.begin() + max_points, keypoints.end(),
[](const cv::KeyPoint& k1, const cv::KeyPoint& k2) {return k1.response > k2.response;});
keypoints.erase(keypoints.begin() + max_points, keypoints.end());
}
if(descriptors.needed())
{
impl.Compute_Descriptors(keypoints, descriptors);
@ -215,6 +225,7 @@ namespace cv
fs << "octaves" << octaves;
fs << "sublevels" << sublevels;
fs << "diffusivity" << diffusivity;
fs << "max_points" << max_points;
}
void read(const FileNode& fn) CV_OVERRIDE
@ -234,6 +245,8 @@ namespace cv
sublevels = (int)fn["sublevels"];
if (!fn["diffusivity"].empty())
diffusivity = static_cast<KAZE::DiffusivityType>((int)fn["diffusivity"]);
if (!fn["max_points"].empty())
max_points = (int)fn["max_points"];
}
DescriptorType descriptor;
@ -243,15 +256,16 @@ namespace cv
int octaves;
int sublevels;
KAZE::DiffusivityType diffusivity;
int max_points;
};
Ptr<AKAZE> AKAZE::create(DescriptorType descriptor_type,
int descriptor_size, int descriptor_channels,
float threshold, int octaves,
int sublevels, KAZE::DiffusivityType diffusivity)
int sublevels, KAZE::DiffusivityType diffusivity, int max_points)
{
return makePtr<AKAZE_Impl>(descriptor_type, descriptor_size, descriptor_channels,
threshold, octaves, sublevels, diffusivity);
threshold, octaves, sublevels, diffusivity, max_points);
}
String AKAZE::getDefaultName() const

View File

@ -64,9 +64,12 @@
//! @{
/**
@brief Detects corners using the FAST algorithm, returns mask.
@param src_data,src_step Source image
@param dst_data,dst_step Destination mask
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param dst_data Destination mask data
@param dst_step Destination mask step
@param width Source image width
@param height Source image height
@param type FAST type
*/
inline int hal_ni_FAST_dense(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, cv::FastFeatureDetector::DetectorType type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
@ -89,8 +92,10 @@ inline int hal_ni_FAST_NMS(const uchar* src_data, size_t src_step, uchar* dst_da
/**
@brief Detects corners using the FAST algorithm.
@param src_data,src_step Source image
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param width Source image width
@param height Source image height
@param keypoints_data Pointer to keypoints
@param keypoints_count Count of keypoints
@param threshold Threshold for keypoint

View File

@ -86,9 +86,9 @@ void image_derivatives_scharr(const cv::Mat& src, cv::Mat& dst, int xorder, int
/**
* @brief This function computes the Perona and Malik conductivity coefficient g1
* g1 = exp(-|dL|^2/k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
*/
void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
@ -117,9 +117,9 @@ void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
/**
* @brief This function computes the Perona and Malik conductivity coefficient g2
* g2 = 1 / (1 + dL^2 / k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
*/
void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
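As a worked illustration of the g2 formula documented above, the per-pixel computation reduces to g2 = 1 / (1 + (Lx^2 + Ly^2) / k^2). The sketch below is a plain reference loop assuming CV_32F single-channel matrices; the actual implementation in this file works on InputArray/OutputArray and is optimized:

#include <opencv2/core.hpp>

// Reference (non-optimized) Perona-Malik g2 conductivity coefficient.
static void pm_g2_reference(const cv::Mat& Lx, const cv::Mat& Ly, cv::Mat& dst, float k)
{
    CV_Assert(Lx.type() == CV_32F && Lx.size() == Ly.size());
    dst.create(Lx.size(), CV_32F);
    const float inv_k2 = 1.0f / (k * k);
    for (int y = 0; y < Lx.rows; y++)
    {
        for (int x = 0; x < Lx.cols; x++)
        {
            const float dx = Lx.at<float>(y, x);
            const float dy = Ly.at<float>(y, x);
            dst.at<float>(y, x) = 1.0f / (1.0f + (dx * dx + dy * dy) * inv_k2);
        }
    }
}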
@ -146,9 +146,9 @@ void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
/* ************************************************************************* */
/**
* @brief This function computes Weickert conductivity coefficient gw
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
* @note For more information check the following paper: J. Weickert
* Applications of nonlinear diffusion in image processing and computer vision,
@ -183,9 +183,9 @@ void weickert_diffusivity(InputArray _Lx, InputArray _Ly, OutputArray _dst, floa
/**
* @brief This function computes Charbonnier conductivity coefficient gc
* gc = 1 / sqrt(1 + dL^2 / k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
* @note For more information check the following paper: J. Weickert
* Applications of nonlinear diffusion in image processing and computer vision,
@ -323,7 +323,7 @@ void compute_scharr_derivatives(const cv::Mat& src, cv::Mat& dst, int xorder, in
* @param _ky Vertical kernel values
* @param dx Derivative order in X-direction (horizontal)
* @param dy Derivative order in Y-direction (vertical)
* @param scale_ Scale factor or derivative size
* @param scale Scale factor or derivative size
*/
void compute_derivative_kernels(cv::OutputArray _kx, cv::OutputArray _ky, int dx, int dy, int scale) {
CV_INSTRUMENT_REGION();
@ -415,7 +415,7 @@ private:
/* ************************************************************************* */
/**
* @brief This function performs a scalar non-linear diffusion step
* @param Ld2 Output image in the evolution
* @param Ld Output image in the evolution
* @param c Conductivity image
* @param Lstep Previous image in the evolution
* @param stepsize The step size in time units
@ -490,7 +490,7 @@ void nld_step_scalar(cv::Mat& Ld, const cv::Mat& c, cv::Mat& Lstep, float stepsi
/* ************************************************************************* */
/**
* @brief This function downsamples the input image using OpenCV resize
* @param img Input image to be downsampled
* @param src Input image to be downsampled
* @param dst Output image with half of the resolution of the input image
*/
void halfsample_image(const cv::Mat& src, cv::Mat& dst) {

View File

@ -6,7 +6,7 @@
* @brief This function computes the value of a 2D Gaussian function
* @param x X Position
* @param y Y Position
* @param sig Standard Deviation
* @param sigma Standard Deviation
*/
inline float gaussian(float x, float y, float sigma) {
return expf(-(x*x + y*y) / (2.0f*sigma*sigma));

View File

@ -80,7 +80,6 @@ public:
* @param inputData dataset containing the points to index
* @param params Index parameters
* @param d Distance functor
* @return
*/
CompositeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = CompositeIndexParams(),
Distance d = Distance()) : index_params_(params)

View File

@ -97,7 +97,6 @@ public:
}
/** @brief set one bit to 0
* @param index
*/
void reset(size_t index)
{
@ -108,7 +107,6 @@ public:
* This function is useful when resetting a given set of bits so that the
* whole bitset ends up being 0: if that's the case, we don't care about setting
* other bits to 0
* @param index
*/
void reset_block(size_t index)
{
@ -116,7 +114,6 @@ public:
}
/** resize the bitset so that it contains at least sz bits
* @param sz
*/
void resize(size_t sz)
{

View File

@ -101,7 +101,6 @@ public:
* Print log message
* @param level Log level
* @param fmt Message format
* @return
*/
static int log(int level, const char* fmt, ...)
{

View File

@ -214,8 +214,6 @@ public:
}
/** Get a bucket given the key
* @param key
* @return
*/
inline const Bucket* getBucketFromKey(BucketKey key) const
{
@ -253,7 +251,6 @@ public:
}
/** Get statistics about the table
* @return
*/
LshStats getStats() const;

Some files were not shown because too many files have changed in this diff.