Mirror of https://github.com/opencv/opencv.git (synced 2025-06-07 17:44:04 +08:00)

Commit fdab565711: Merge branch 4.x
3rdparty/libtengine/tengine.cmake (vendored): 80 lines changed
@ -1,80 +0,0 @@
# COPYRIGHT
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# License); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020, OPEN AI LAB
# Author: qtang@openailab.com or https://github.com/BUG1989
#         qli@openailab.com
#         sqfu@openailab.com

SET(TENGINE_COMMIT_VERSION "e89cf8870de2ff0a80cfe626c0b52b2a16fb302e")
SET(OCV_TENGINE_DIR "${OpenCV_BINARY_DIR}/3rdparty/libtengine")
SET(OCV_TENGINE_SOURCE_PATH "${OCV_TENGINE_DIR}/Tengine-${TENGINE_COMMIT_VERSION}")

IF(EXISTS "${OCV_TENGINE_SOURCE_PATH}")
    MESSAGE(STATUS "Tengine is exist already at: ${OCV_TENGINE_SOURCE_PATH}")

    SET(Tengine_FOUND ON)
    SET(BUILD_TENGINE ON)
ELSE()
    SET(OCV_TENGINE_FILENAME "${TENGINE_COMMIT_VERSION}.zip")#name
    SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url
    SET(tengine_md5sum 23f61ebb1dd419f1207d8876496289c5) #md5sum

    ocv_download(FILENAME ${OCV_TENGINE_FILENAME}
                 HASH ${tengine_md5sum}
                 URL
                   "${OPENCV_TENGINE_URL}"
                   "$ENV{OPENCV_TENGINE_URL}"
                   "${OCV_TENGINE_URL}"
                 DESTINATION_DIR "${OCV_TENGINE_DIR}"
                 ID TENGINE
                 STATUS res
                 UNPACK RELATIVE_URL)

    if (NOT res)
        MESSAGE(STATUS "TENGINE DOWNLOAD FAILED. Turning Tengine_FOUND off.")
        SET(Tengine_FOUND OFF)
    else ()
        MESSAGE(STATUS "TENGINE DOWNLOAD success . ")

        SET(Tengine_FOUND ON)
        SET(BUILD_TENGINE ON)
    endif()
ENDIF()

if(BUILD_TENGINE)
    SET(HAVE_TENGINE 1)

    if(NOT ANDROID)
        # linux system
        if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
            SET(TENGINE_TOOLCHAIN_FLAG "-march=armv7-a")
        elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64
            SET(TENGINE_TOOLCHAIN_FLAG "-march=armv8-a")
        endif()
    endif()

    SET(BUILT_IN_OPENCV ON) ## set for tengine compile discern .
    SET(Tengine_INCLUDE_DIR "${OCV_TENGINE_SOURCE_PATH}/include" CACHE INTERNAL "")
    if(EXISTS "${OCV_TENGINE_SOURCE_PATH}/CMakeLists.txt")
        add_subdirectory("${OCV_TENGINE_SOURCE_PATH}" "${OCV_TENGINE_DIR}/build")
    else()
        message(WARNING "TENGINE: Missing 'CMakeLists.txt' in source code package: ${OCV_TENGINE_SOURCE_PATH}")
    endif()
    SET(Tengine_LIB "tengine" CACHE INTERNAL "")
endif()
3rdparty/readme.txt (vendored): 4 lines changed
@ -39,7 +39,9 @@ libspng Portable Network Graphics library.
libtiff  Tag Image File Format (TIFF) Software
         Copyright (c) 1988-1997 Sam Leffler
         Copyright (c) 1991-1997 Silicon Graphics, Inc.
         See libtiff home page http://www.libtiff.org/
         See libtiff home page #1 http://www.simplesystems.org/libtiff/
                               #2 https://libtiff.gitlab.io/libtiff/
                               #3 http://libtiff.maptools.org/
         for details and links to the source code

         WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
@ -463,9 +463,6 @@ OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)"
|
||||
OCV_OPTION(WITH_ANDROID_NATIVE_CAMERA "Use Android NDK for Camera I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 23)
|
||||
VISIBLE_IF ANDROID
|
||||
VERIFY HAVE_ANDROID_NATIVE_CAMERA)
|
||||
OCV_OPTION(WITH_TENGINE "Include Arm Inference Tengine support" OFF
|
||||
VISIBLE_IF (ARM OR AARCH64) AND (UNIX OR ANDROID) AND NOT IOS
|
||||
VERIFY HAVE_TENGINE)
|
||||
OCV_OPTION(WITH_ONNX "Include Microsoft ONNX Runtime support" OFF
|
||||
VISIBLE_IF TRUE
|
||||
VERIFY HAVE_ONNX)
|
||||
@ -768,9 +765,6 @@ if(WITH_LAPACK)
|
||||
endif()
|
||||
include(cmake/OpenCVFindProtobuf.cmake)
|
||||
include(cmake/OpenCVDetectFlatbuffers.cmake)
|
||||
if(WITH_TENGINE)
|
||||
include(cmake/OpenCVFindTengine.cmake)
|
||||
endif()
|
||||
if(WITH_TIMVX)
|
||||
include(cmake/OpenCVFindTIMVX.cmake)
|
||||
endif()
|
||||
@ -1623,10 +1617,6 @@ if(WITH_VA OR HAVE_VA)
|
||||
status(" VA:" HAVE_VA THEN "YES" ELSE NO)
|
||||
endif()
|
||||
|
||||
if(WITH_TENGINE OR HAVE_TENGINE)
|
||||
status(" Tengine:" HAVE_TENGINE THEN "YES (${TENGINE_LIBRARIES})" ELSE NO)
|
||||
endif()
|
||||
|
||||
if(WITH_LAPACK OR HAVE_LAPACK)
|
||||
status(" Lapack:" HAVE_LAPACK THEN "YES (${LAPACK_LIBRARIES} ${LAPACK_VERSION})" ELSE NO)
|
||||
endif()
|
||||
@ -1693,6 +1683,10 @@ else()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(BUILD_opencv_dnn AND OPENCV_DNN_BACKEND_DEFAULT)
|
||||
status(" Default DNN backend:" ${OPENCV_DNN_BACKEND_DEFAULT})
|
||||
endif()
|
||||
|
||||
if(WITH_EIGEN OR HAVE_EIGEN)
|
||||
status(" Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
|
||||
endif()
|
||||
|
@ -60,6 +60,7 @@ Created by: Puttemans Steven - April 2016
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
|
@ -251,7 +251,7 @@ if(NOT ${found})
|
||||
set(${include_path} "${_include_path}" CACHE INTERNAL "")
|
||||
set(${include_dir} "${_include_dir}" CACHE PATH "Python include dir")
|
||||
set(${include_dir2} "${_include_dir2}" CACHE PATH "Python include dir 2")
|
||||
set(${packages_path} "${_packages_path}" CACHE PATH "Where to install the python packages.")
|
||||
set(${packages_path} "${_packages_path}" CACHE STRING "Where to install the python packages.")
|
||||
set(${numpy_include_dirs} ${_numpy_include_dirs} CACHE PATH "Path to numpy headers")
|
||||
set(${numpy_version} "${_numpy_version}" CACHE INTERNAL "")
|
||||
endif()
|
||||
|
@ -1,78 +0,0 @@
|
||||
# COPYRIGHT
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# License); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
# Copyright (c) 2020, OPEN AI LAB
|
||||
# Author: qtang@openailab.com or https://github.com/BUG1989
|
||||
#
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Path for Tengine binaries
|
||||
# ----------------------------------------------------------------------------
|
||||
set(OPENCV_LIBTENGINE_ROOT_DIR "" CACHE PATH "Path to TENGINE binaries installation")
|
||||
|
||||
IF(OPENCV_LIBTENGINE_ROOT_DIR AND NOT BUILD_TENGINE)
|
||||
|
||||
MESSAGE(STATUS "TENGINE:-- Use binaries at ${OPENCV_LIBTENGINE_ROOT_DIR}")
|
||||
|
||||
SET(Tengine_FOUND ON)
|
||||
set(BUILD_TENGINE OFF)
|
||||
|
||||
SET(Tengine_INCLUDE_DIR "${OPENCV_LIBTENGINE_ROOT_DIR}/include" CACHE PATH "TENGINE include dir")
|
||||
SET(Tengine_LIB "${OPENCV_LIBTENGINE_ROOT_DIR}/lib/libtengine.a" CACHE PATH "TENGINE library dir")
|
||||
|
||||
ELSE()
|
||||
IF(ANDROID)
|
||||
IF(OPENCV_TENGINE_FORCE_ANDROID)
|
||||
# nothing, use Android
|
||||
ELSEIF(OPENCV_TENGINE_SKIP_ANDROID)
|
||||
set(Tengine_FOUND OFF)
|
||||
set(HAVE_TENGINE FALSE)
|
||||
return()
|
||||
ELSEIF(NOT DEFINED ANDROID_NDK_REVISION)
|
||||
MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION is not defined")
|
||||
set(Tengine_FOUND OFF)
|
||||
set(HAVE_TENGINE FALSE)
|
||||
return()
|
||||
ELSEIF(ANDROID_NDK_REVISION VERSION_LESS 14)
|
||||
MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION=${ANDROID_NDK_REVISION}")
|
||||
set(Tengine_FOUND OFF)
|
||||
set(HAVE_TENGINE FALSE)
|
||||
return()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
MESSAGE(STATUS "TENGINE:-- Build Tengine from source code. ")
|
||||
include("${OpenCV_SOURCE_DIR}/3rdparty/libtengine/tengine.cmake")
|
||||
ENDIF()
|
||||
|
||||
IF(NOT Tengine_LIB)
|
||||
SET(Tengine_FOUND OFF)
|
||||
MESSAGE(STATUS "#### Could not find Tengine lib. Turning Tengine_FOUND off")
|
||||
ENDIF()
|
||||
|
||||
IF (Tengine_FOUND)
|
||||
MESSAGE(STATUS "Found Tengine include: ${Tengine_INCLUDE_DIR}")
|
||||
MESSAGE(STATUS "Found Tengine libraries: ${Tengine_LIB}")
|
||||
set(HAVE_TENGINE 1)
|
||||
set(TENGINE_LIBRARIES ${Tengine_LIB})
|
||||
set(TENGINE_INCLUDE_DIRS ${Tengine_INCLUDE_DIR})
|
||||
ENDIF (Tengine_FOUND)
|
||||
|
||||
MARK_AS_ADVANCED(
|
||||
Tengine_INCLUDE_DIR
|
||||
Tengine_LIB
|
||||
)
|
@ -1,15 +1,12 @@
|
||||
# Gitlab-style mirror
|
||||
# CMake scripts look for opencv/opencv_3rdparty,
|
||||
# OAID/Tengine, 01org/tbb(oneAPI/oneTBB), opencv/ade
|
||||
# 01org/tbb(oneAPI/oneTBB), opencv/ade
|
||||
# from OPENCV_DOWNLOAD_MIRROR
|
||||
ocv_update(OPENCV_DOWNLOAD_MIRROR_URL "")
|
||||
|
||||
######
|
||||
# Download via commit id
|
||||
######
|
||||
# Tengine
|
||||
ocv_update(TENGINE_PKG_MD5_CUSTOM "")
|
||||
ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
|
||||
# NVIDIA_OPTICAL_FLOW
|
||||
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE "")
|
||||
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
|
||||
@ -77,7 +74,7 @@ else()
|
||||
ocv_download_url_custom_usercontent(opencv)
|
||||
elseif(DL_ID STREQUAL "wechat_qrcode")
|
||||
ocv_download_url_gitcode_usercontent(WeChatCV)
|
||||
elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
|
||||
elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
|
||||
ocv_download_url_custom_archive_commit_id()
|
||||
elseif(DL_ID STREQUAL "TBB")
|
||||
ocv_download_url_custom_archive_release()
|
||||
|
@ -1,9 +1,6 @@
|
||||
######
|
||||
# Download via commit id
|
||||
######
|
||||
# Tengine
|
||||
ocv_update(TENGINE_PKG_MD5_GITCODE 1b5908632b557275cd6e85b0c03f9690)
|
||||
ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
|
||||
# NVIDIA_OPTICAL_FLOW
|
||||
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE 8d5b7eeb24d6ca9c6bcfdff4196d5b47)
|
||||
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
|
||||
@ -74,7 +71,7 @@ if((DL_ID STREQUAL "FFMPEG") OR (DL_ID STREQUAL "IPPICV") OR (DL_ID STREQUAL "da
|
||||
ocv_download_url_gitcode_usercontent(opencv)
|
||||
elseif(DL_ID STREQUAL "wechat_qrcode")
|
||||
ocv_download_url_gitcode_usercontent(mirrors/WeChatCV)
|
||||
elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
|
||||
elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
|
||||
ocv_download_url_gitcode_archive_commit_id()
|
||||
elseif(DL_ID STREQUAL "TBB")
|
||||
ocv_download_url_gitcode_archive_release(OPENCV_TBB_SUBDIR)
|
||||
|
@ -224,6 +224,16 @@ Following options can be used to produce special builds with instrumentation or
@see [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization)
@see [ThinLTO](https://clang.llvm.org/docs/ThinLTO.html)

## Enable IPP optimization

The following options can be used to enable IPP optimizations for individual functions; each one increases the size of the OpenCV library. All options are disabled by default.

| Option | Functions | + roughly size |
| -------| --------- | -------------- |
| `OPENCV_IPP_GAUSSIAN_BLUR` | GaussianBlur() | +8Mb |
| `OPENCV_IPP_MEAN` | mean() / meanStdDev() | +0.2Mb |
| `OPENCV_IPP_MINMAX` | minMaxLoc() / minMaxIdx() | +0.2Mb |
| `OPENCV_IPP_SUM` | sum() | +0.1Mb |
# Functional features and dependencies {#tutorial_config_reference_func}

@ -484,7 +494,6 @@ OpenCV have own DNN inference module which have own build-in engine, but can als
| `OPENCV_DNN_CUDA` | _OFF_ | Enable CUDA backend. [CUDA](https://en.wikipedia.org/wiki/CUDA), CUBLAS and [CUDNN](https://developer.nvidia.com/cudnn) must be installed. |
| `WITH_HALIDE` | _OFF_ | Use experimental [Halide](https://en.wikipedia.org/wiki/Halide_(programming_language)) backend which can generate optimized code for dnn-layers at runtime. Halide must be installed. |
| `WITH_VULKAN` | _OFF_ | Enable experimental [Vulkan](https://en.wikipedia.org/wiki/Vulkan_(API)) backend. Does not require additional dependencies, but can use external Vulkan headers (`VULKAN_INCLUDE_DIRS`). |
| `WITH_TENGINE` | _OFF_ | Enable experimental [Tengine](https://github.com/OAID/Tengine) backend for ARM CPUs. Tengine library must be installed. |

# Installation layout {#tutorial_config_reference_install}

@ -566,6 +575,7 @@ Following options can be used to change installation layout for common scenarios
| ------ | ------- | ----------- |
| `OPENCV_ENABLE_NONFREE` | _OFF_ | Some algorithms included in the library are known to be protected by patents and are disabled by default. |
| `OPENCV_FORCE_3RDPARTY_BUILD`| _OFF_ | Enable all `BUILD_` options at once. |
| `OPENCV_IPP_ENABLE_ALL`| _OFF_ | Enable all `OPENCV_IPP_` options at once. |
| `ENABLE_CCACHE` | _ON_ (on Unix-like platforms) | Enable [ccache](https://en.wikipedia.org/wiki/Ccache) auto-detection. This tool wraps compiler calls and caches results, can significantly improve re-compilation time. |
| `ENABLE_PRECOMPILED_HEADERS` | _ON_ (for MSVC) | Enable precompiled headers support. Improves build time. |
| `BUILD_DOCS` | _OFF_ | Enable documentation build (_doxygen_, _doxygen_cpp_, _doxygen_python_, _doxygen_javadoc_ targets). [Doxygen](http://www.doxygen.org/index.html) must be installed for C++ documentation build. Python and [BeautifulSoup4](https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)) must be installed for Python documentation build. Javadoc and Ant must be installed for Java documentation build (part of Java SDK). |
@ -1,5 +1,6 @@
|
||||
#include "precomp.hpp"
|
||||
#include "ap3p.h"
|
||||
#include "polynom_solver.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
@ -7,67 +8,11 @@
|
||||
static inline double cbrt(double x) { return (double)cv::cubeRoot((float)x); };
|
||||
#endif
|
||||
|
||||
namespace cv {
|
||||
|
||||
static
|
||||
void solveQuartic(const double *factors, double *realRoots)
|
||||
{
|
||||
const double &a4 = factors[0];
|
||||
const double &a3 = factors[1];
|
||||
const double &a2 = factors[2];
|
||||
const double &a1 = factors[3];
|
||||
const double &a0 = factors[4];
|
||||
|
||||
double a4_2 = a4 * a4;
|
||||
double a3_2 = a3 * a3;
|
||||
double a4_3 = a4_2 * a4;
|
||||
double a2a4 = a2 * a4;
|
||||
|
||||
double p4 = (8 * a2a4 - 3 * a3_2) / (8 * a4_2);
|
||||
double q4 = (a3_2 * a3 - 4 * a2a4 * a3 + 8 * a1 * a4_2) / (8 * a4_3);
|
||||
double r4 = (256 * a0 * a4_3 - 3 * (a3_2 * a3_2) - 64 * a1 * a3 * a4_2 + 16 * a2a4 * a3_2) / (256 * (a4_3 * a4));
|
||||
|
||||
double p3 = ((p4 * p4) / 12 + r4) / 3; // /=-3
|
||||
double q3 = (72 * r4 * p4 - 2 * p4 * p4 * p4 - 27 * q4 * q4) / 432; // /=2
|
||||
|
||||
double t; // *=2
|
||||
std::complex<double> w;
|
||||
if (q3 >= 0)
|
||||
w = -std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
|
||||
else
|
||||
w = std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
|
||||
if (w.imag() == 0.0) {
|
||||
w.real(std::cbrt(w.real()));
|
||||
t = 2.0 * (w.real() + p3 / w.real());
|
||||
} else {
|
||||
w = pow(w, 1.0 / 3);
|
||||
t = 4.0 * w.real();
|
||||
}
|
||||
|
||||
std::complex<double> sqrt_2m = sqrt(static_cast<std::complex<double> >(-2 * p4 / 3 + t));
|
||||
double B_4A = -a3 / (4 * a4);
|
||||
double complex1 = 4 * p4 / 3 + t;
|
||||
#if defined(__clang__) && defined(__arm__) && (__clang_major__ == 3 || __clang_major__ == 4) && !defined(__ANDROID__)
|
||||
// details: https://github.com/opencv/opencv/issues/11135
|
||||
// details: https://github.com/opencv/opencv/issues/11056
|
||||
std::complex<double> complex2 = 2 * q4;
|
||||
complex2 = std::complex<double>(complex2.real() / sqrt_2m.real(), 0);
|
||||
#else
|
||||
std::complex<double> complex2 = 2 * q4 / sqrt_2m;
|
||||
#endif
|
||||
double sqrt_2m_rh = sqrt_2m.real() / 2;
|
||||
double sqrt1 = sqrt(-(complex1 + complex2)).real() / 2;
|
||||
realRoots[0] = B_4A + sqrt_2m_rh + sqrt1;
|
||||
realRoots[1] = B_4A + sqrt_2m_rh - sqrt1;
|
||||
double sqrt2 = sqrt(-(complex1 - complex2)).real() / 2;
|
||||
realRoots[2] = B_4A - sqrt_2m_rh + sqrt2;
|
||||
realRoots[3] = B_4A - sqrt_2m_rh - sqrt2;
|
||||
}
|
||||
|
||||
static void polishQuarticRoots(const double *coeffs, double *roots) {
|
||||
namespace {
|
||||
void polishQuarticRoots(const double *coeffs, double *roots, int nb_roots) {
|
||||
const int iterations = 2;
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
for (int j = 0; j < nb_roots; ++j) {
|
||||
double error =
|
||||
(((coeffs[0] * roots[j] + coeffs[1]) * roots[j] + coeffs[2]) * roots[j] + coeffs[3]) * roots[j] +
|
||||
coeffs[4];
|
||||
@ -124,7 +69,9 @@ inline void mat_mult(const double a[3][3], const double b[3][3], double result[3
|
||||
result[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1];
|
||||
result[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2];
|
||||
}
|
||||
}
|
||||
|
||||
namespace cv {
|
||||
void ap3p::init_inverse_parameters() {
|
||||
inv_fx = 1. / fx;
|
||||
inv_fy = 1. / fy;
|
||||
@ -228,8 +175,9 @@ int ap3p::computePoses(const double featureVectors[3][4],
|
||||
2 * (g6 * g7 - g1 * g2 - g3 * g4),
|
||||
g7 * g7 - g2 * g2 - g4 * g4};
|
||||
double s[4];
|
||||
solveQuartic(coeffs, s);
|
||||
polishQuarticRoots(coeffs, s);
|
||||
int nb_roots = solve_deg4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4],
|
||||
s[0], s[1], s[2], s[3]);
|
||||
polishQuarticRoots(coeffs, s, nb_roots);
|
||||
|
||||
double temp[3];
|
||||
vect_cross(k1, nl, temp);
|
||||
@ -255,7 +203,7 @@ int ap3p::computePoses(const double featureVectors[3][4],
|
||||
double reproj_errors[4];
|
||||
|
||||
int nb_solutions = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int i = 0; i < nb_roots; ++i) {
|
||||
double ctheta1p = s[i];
|
||||
if (abs(ctheta1p) > 1)
|
||||
continue;
|
||||
|
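// Note (editorial annotation, not part of the upstream diff): solveQuartic() always
// wrote four values even when some roots were complex, which could yield non-finite
// entries in s[] for degenerate inputs (see the ctheta1p_nan_23607 regression test
// added below); solve_deg4() instead reports how many real roots exist, and both the
// polishing step and the pose loop above are limited to that count.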
@ -239,7 +239,8 @@ public:
|
||||
// (5) Compute the left eigenvectors of the action matrix
|
||||
Eigen::EigenSolver<Eigen::Matrix<double, 10, 10>> eigensolver(action_mat_eig);
|
||||
const Eigen::VectorXcd &eigenvalues = eigensolver.eigenvalues();
|
||||
const auto * const eig_vecs_ = (double *) eigensolver.eigenvectors().real().data();
|
||||
const Eigen::MatrixXcd eigenvectors = eigensolver.eigenvectors();
|
||||
const auto * const eig_vecs_ = (double *) eigenvectors.data();
|
||||
#else
|
||||
Matx<double, 10, 10> A = constraint_mat.colRange(0, 10),
|
||||
B = constraint_mat.colRange(10, 20), eliminated_mat;
|
||||
|
@ -115,8 +115,8 @@ TEST_P(EstimateAffine2D, testNPoints)
|
||||
|
||||
EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
|
||||
|
||||
bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
|
||||
m == accumulate(inliers.begin(), inliers.begin() + m, 0);
|
||||
bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
|
||||
m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
|
||||
|
||||
EXPECT_TRUE(inliers_good);
|
||||
}
|
||||
|
@ -161,8 +161,8 @@ bool CV_Affine3D_EstTest::testNPoints()
|
||||
return false;
|
||||
}
|
||||
|
||||
bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
|
||||
m == accumulate(outl.begin(), outl.begin() + m, 0);
|
||||
bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
|
||||
m == std::accumulate(outl.begin(), outl.begin() + m, 0);
|
||||
|
||||
if (!outl_good)
|
||||
{
|
||||
|
@ -125,8 +125,8 @@ TEST_P(EstimateAffinePartial2D, testNPoints)
|
||||
|
||||
EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
|
||||
|
||||
bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
|
||||
m == accumulate(inliers.begin(), inliers.begin() + m, 0);
|
||||
bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
|
||||
m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
|
||||
|
||||
EXPECT_TRUE(inliers_good);
|
||||
}
|
||||
|
@ -41,6 +41,7 @@
|
||||
//M*/
|
||||
|
||||
#include "test_precomp.hpp"
|
||||
#include "opencv2/core/utils/logger.hpp"
|
||||
|
||||
namespace opencv_test { namespace {
|
||||
|
||||
@ -2259,4 +2260,65 @@ TEST(Calib3d_SolvePnP, inputShape)
|
||||
}
|
||||
}
|
||||
|
||||
bool hasNan(const cv::Mat& mat)
|
||||
{
|
||||
bool has = false;
|
||||
if (mat.type() == CV_32F)
|
||||
{
|
||||
for(int i = 0; i < static_cast<int>(mat.total()); i++)
|
||||
has |= cvIsNaN(mat.at<float>(i)) != 0;
|
||||
}
|
||||
else if (mat.type() == CV_64F)
|
||||
{
|
||||
for(int i = 0; i < static_cast<int>(mat.total()); i++)
|
||||
has |= cvIsNaN(mat.at<double>(i)) != 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
has = true;
|
||||
CV_LOG_ERROR(NULL, "check hasNan called with unsupported type!");
|
||||
}
|
||||
|
||||
return has;
|
||||
}
|
||||
|
||||
TEST(AP3P, ctheta1p_nan_23607)
|
||||
{
|
||||
// the task is not well defined and may not converge (empty R, t) or should
|
||||
// converge to some non-NaN solution
|
||||
const std::array<cv::Point2d, 3> cameraPts = {
|
||||
cv::Point2d{0.042784865945577621, 0.59844839572906494},
|
||||
cv::Point2d{-0.028428621590137482, 0.60354739427566528},
|
||||
cv::Point2d{0.0046037044376134872, 0.70674681663513184}
|
||||
};
|
||||
const std::array<cv::Point3d, 3> modelPts = {
|
||||
cv::Point3d{-0.043258000165224075, 0.020459245890378952, -0.0069921980611979961},
|
||||
cv::Point3d{-0.045648999512195587, 0.0029820732306689024, 0.0079000638797879219},
|
||||
cv::Point3d{-0.043276999145746231, -0.013622495345771313, 0.0080113131552934647}
|
||||
};
|
||||
|
||||
std::vector<Mat> R, t;
|
||||
solveP3P(modelPts, cameraPts, Mat::eye(3, 3, CV_64F), Mat(), R, t, SOLVEPNP_AP3P);
|
||||
|
||||
EXPECT_EQ(R.size(), 2ul);
|
||||
EXPECT_EQ(t.size(), 2ul);
|
||||
|
||||
// Try apply rvec and tvec to get model points from camera points.
|
||||
Mat pts = Mat(modelPts).reshape(1, 3);
|
||||
Mat expected = Mat(cameraPts).reshape(1, 3);
|
||||
for (size_t i = 0; i < R.size(); ++i) {
|
||||
EXPECT_TRUE(!hasNan(R[i]));
|
||||
EXPECT_TRUE(!hasNan(t[i]));
|
||||
|
||||
Mat transform;
|
||||
cv::Rodrigues(R[i], transform);
|
||||
Mat res = pts * transform.t();
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
res.row(j) += t[i].reshape(1, 1);
|
||||
res.row(j) /= res.row(j).at<double>(2);
|
||||
}
|
||||
EXPECT_LE(cvtest::norm(res.colRange(0, 2), expected, NORM_INF), 3e-16);
|
||||
}
|
||||
}
|
||||
|
||||
}} // namespace
|
||||
|
@ -91,8 +91,8 @@ TEST(Calib3d_EstimateTranslation3D, testNPoints)
|
||||
<< "aff est: " << trans_est << endl
|
||||
<< "aff ref: " << trans;
|
||||
|
||||
bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
|
||||
m == accumulate(outl.begin(), outl.begin() + m, 0);
|
||||
bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
|
||||
m == std::accumulate(outl.begin(), outl.begin() + m, 0);
|
||||
|
||||
EXPECT_TRUE(outl_good);
|
||||
}
|
||||
|
@ -60,6 +60,26 @@ if(CV_TRACE AND HAVE_ITT)
|
||||
add_definitions(-DOPENCV_WITH_ITT=1)
|
||||
endif()
|
||||
|
||||
# https://github.com/opencv/opencv/issues/24145
|
||||
if(HAVE_IPP)
|
||||
OCV_OPTION(OPENCV_IPP_ENABLE_ALL "Enable all OPENCV_IPP_ options at once" OFF)
|
||||
OCV_OPTION(OPENCV_IPP_MEAN "Enable IPP optimizations for mean (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
|
||||
OCV_OPTION(OPENCV_IPP_MINMAX "Enable IPP optimizations for minMaxLoc/minMaxIdx (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
|
||||
OCV_OPTION(OPENCV_IPP_SUM "Enable IPP optimizations for sum (+100Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
|
||||
|
||||
if(OPENCV_IPP_MEAN)
|
||||
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/mean.dispatch.cpp "OPENCV_IPP_MEAN=1")
|
||||
endif()
|
||||
|
||||
if(OPENCV_IPP_MINMAX)
|
||||
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/minmax.cpp "OPENCV_IPP_MINMAX=1")
|
||||
endif()
|
||||
|
||||
if(OPENCV_IPP_SUM)
|
||||
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/sum.dispatch.cpp "OPENCV_IPP_SUM=1")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
file(GLOB lib_cuda_hdrs
|
||||
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.hpp"
|
||||
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.h")
|
||||
|
@ -1118,6 +1118,13 @@ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
|
||||
*/
|
||||
CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);
|
||||
|
||||
/** @brief Broadcast the given Mat to the given shape.
|
||||
* @param src input array
|
||||
* @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
|
||||
* @param dst output array that has the given shape
|
||||
*/
|
||||
CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
|
||||
|
||||
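// A hedged usage sketch of the new cv::broadcast() declared above (not part of the
// upstream diff; the helper name is hypothetical and numpy-style broadcasting, where
// size-1 dimensions are repeated to match the requested shape, is assumed):
inline void broadcastUsageSketch()
{
    cv::Mat src = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
    std::vector<int> shape = {4, 3};   // target shape passed as CV_32S values
    cv::Mat dst;
    cv::broadcast(src, shape, dst);    // dst becomes 4x3, every row equal to src
    CV_Assert(dst.rows == 4 && dst.cols == 3);
}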
enum RotateFlags {
|
||||
ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
|
||||
ROTATE_180 = 1, //!<Rotate 180 degrees clockwise
|
||||
|
@ -198,16 +198,32 @@ public:
|
||||
CV_WRAP GpuMat clone() const;
|
||||
|
||||
//! copies the GpuMat content to device memory (Blocking call)
|
||||
CV_WRAP void copyTo(OutputArray dst) const;
|
||||
void copyTo(OutputArray dst) const;
|
||||
//! bindings overload which copies the GpuMat content to device memory (Blocking call)
|
||||
CV_WRAP void copyTo(CV_OUT GpuMat& dst) const {
|
||||
copyTo(static_cast<OutputArray>(dst));
|
||||
}
|
||||
|
||||
//! copies the GpuMat content to device memory (Non-Blocking call)
|
||||
CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
|
||||
void copyTo(OutputArray dst, Stream& stream) const;
|
||||
//! bindings overload which copies the GpuMat content to device memory (Non-Blocking call)
|
||||
CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const {
|
||||
copyTo(static_cast<OutputArray>(dst), stream);
|
||||
}
|
||||
|
||||
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
|
||||
CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
|
||||
void copyTo(OutputArray dst, InputArray mask) const;
|
||||
//! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
|
||||
CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const {
|
||||
copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask));
|
||||
}
|
||||
|
||||
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
|
||||
CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
|
||||
void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
|
||||
//! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
|
||||
CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const {
|
||||
copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask), stream);
|
||||
}
|
||||
|
||||
//! sets some of the GpuMat elements to s (Blocking call)
|
||||
CV_WRAP GpuMat& setTo(Scalar s);
|
||||
@ -222,19 +238,31 @@ public:
|
||||
CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
|
||||
|
||||
//! converts GpuMat to another datatype (Blocking call)
|
||||
CV_WRAP void convertTo(OutputArray dst, int rtype) const;
|
||||
void convertTo(OutputArray dst, int rtype) const;
|
||||
|
||||
//! converts GpuMat to another datatype (Non-Blocking call)
|
||||
CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
|
||||
void convertTo(OutputArray dst, int rtype, Stream& stream) const;
|
||||
//! bindings overload which converts GpuMat to another datatype (Non-Blocking call)
|
||||
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const {
|
||||
convertTo(static_cast<OutputArray>(dst), rtype, stream);
|
||||
}
|
||||
|
||||
//! converts GpuMat to another datatype with scaling (Blocking call)
|
||||
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
|
||||
void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
|
||||
//! bindings overload which converts GpuMat to another datatype with scaling(Blocking call)
|
||||
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const {
|
||||
convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta);
|
||||
}
|
||||
|
||||
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
|
||||
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
|
||||
void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
|
||||
|
||||
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
|
||||
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
|
||||
void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
|
||||
//! bindings overload which converts GpuMat to another datatype with scaling (Non-Blocking call)
|
||||
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const {
|
||||
convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta, stream);
|
||||
}
|
||||
|
||||
CV_WRAP void assignTo(GpuMat& m, int type = -1) const;
|
||||
|
||||
|
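// A hedged usage sketch (not part of the upstream diff): the CV_WRAP overloads above
// simply forward to the OutputArray versions, so existing C++ callers are unaffected.
// The helper name is hypothetical and an OpenCV build with CUDA support is assumed.
inline void gpuMatCopyConvertSketch()
{
    cv::Mat h_src = cv::Mat::ones(4, 4, CV_8U), h_mask = cv::Mat::eye(4, 4, CV_8U);
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_src, d_dst, d_mask, d_f32;
    d_src.upload(h_src);
    d_mask.upload(h_mask);
    d_src.copyTo(d_dst, d_mask, stream);                // non-blocking masked copy
    d_src.convertTo(d_f32, CV_32F, 1.0, 0.0, stream);   // non-blocking conversion with scaling
    stream.waitForCompletion();
}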
@ -2014,12 +2014,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
|
||||
inline v_int32x4 v_round(const v_float64x2& a)
|
||||
{
|
||||
static const int32x2_t zero = vdup_n_s32(0);
|
||||
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
|
||||
return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
|
||||
return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
|
||||
}
|
||||
|
||||
inline v_int32x4 v_floor(const v_float64x2& a)
|
||||
|
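// Note (editorial annotation, not part of the upstream diff): vcvtaq_s64_f64 rounds
// halfway cases away from zero, while vcvtnq_s64_f64 rounds them to the nearest even
// value (e.g. 2.5 -> 2, 3.5 -> 4), matching the behaviour of the other v_round
// implementations, which is presumably the motivation for this change.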
@ -924,6 +924,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \
|
||||
return (scalartype)v_get0(res); \
|
||||
}
|
||||
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
|
||||
#if CV_SIMD_SCALABLE_64F
|
||||
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits<v_float64>::vlanes())
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
|
||||
inline scalartype v_reduce_##func(const _Tpvec& a) \
|
||||
|
@ -3,6 +3,7 @@
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/core/ocl.hpp>
|
||||
|
@ -12,7 +12,8 @@
|
||||
# elif defined WINRT || defined _WIN32_WCE
|
||||
/* not supported */
|
||||
# elif defined __ANDROID__ || defined __linux__ || defined _WIN32 || \
|
||||
defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__
|
||||
defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__ || \
|
||||
defined __GNU__
|
||||
# define OPENCV_HAVE_FILESYSTEM_SUPPORT 1
|
||||
# elif defined(__APPLE__)
|
||||
# include <TargetConditionals.h>
|
||||
|
@ -5,8 +5,35 @@ namespace opencv_test
|
||||
{
|
||||
using namespace perf;
|
||||
|
||||
using BroadcastTest = perf::TestBaseWithParam<std::tuple<std::vector<int>, perf::MatType, std::vector<int>>>;
|
||||
typedef Size_MatType BinaryOpTest;
|
||||
|
||||
PERF_TEST_P_(BroadcastTest, basic)
|
||||
{
|
||||
std::vector<int> shape_src = get<0>(GetParam());
|
||||
int dt_type = get<1>(GetParam());
|
||||
std::vector<int> shape_dst = get<2>(GetParam());
|
||||
|
||||
cv::Mat src(static_cast<int>(shape_src.size()), shape_src.data(), dt_type);
|
||||
cv::Mat dst(static_cast<int>(shape_dst.size()), shape_dst.data(), dt_type);
|
||||
|
||||
cv::randu(src, -1.f, 1.f);
|
||||
|
||||
TEST_CYCLE() cv::broadcast(src, shape_dst, dst);
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(/*nothing*/ , BroadcastTest,
|
||||
testing::Combine(
|
||||
testing::Values(std::vector<int>{1, 100, 800},
|
||||
std::vector<int>{10, 1, 800},
|
||||
std::vector<int>{10, 100, 1}),
|
||||
testing::Values(CV_32FC1),
|
||||
testing::Values(std::vector<int>{10, 100, 800})
|
||||
)
|
||||
);
|
||||
|
||||
PERF_TEST_P_(BinaryOpTest, min)
|
||||
{
|
||||
Size sz = get<0>(GetParam());
|
||||
|
@ -1335,7 +1335,7 @@ struct InRange_SIMD
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
|
||||
template <>
|
||||
struct InRange_SIMD<uchar>
|
||||
@ -1344,7 +1344,7 @@ struct InRange_SIMD<uchar>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = v_uint8::nlanes;
|
||||
const int width = VTraits<v_uint8>::vlanes();
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1352,7 +1352,7 @@ struct InRange_SIMD<uchar>
|
||||
v_uint8 low = vx_load(src2 + x);
|
||||
v_uint8 high = vx_load(src3 + x);
|
||||
|
||||
v_store(dst + x, (values >= low) & (high >= values));
|
||||
v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values)));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
@ -1366,7 +1366,7 @@ struct InRange_SIMD<schar>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = v_int8::nlanes;
|
||||
const int width = VTraits<v_int8>::vlanes();
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1374,7 +1374,7 @@ struct InRange_SIMD<schar>
|
||||
v_int8 low = vx_load(src2 + x);
|
||||
v_int8 high = vx_load(src3 + x);
|
||||
|
||||
v_store((schar*)(dst + x), (values >= low) & (high >= values));
|
||||
v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values)));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
@ -1388,7 +1388,7 @@ struct InRange_SIMD<ushort>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = v_uint16::nlanes * 2;
|
||||
const int width = VTraits<v_uint16>::vlanes() * 2;
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1396,11 +1396,11 @@ struct InRange_SIMD<ushort>
|
||||
v_uint16 low1 = vx_load(src2 + x);
|
||||
v_uint16 high1 = vx_load(src3 + x);
|
||||
|
||||
v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes);
|
||||
v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes);
|
||||
v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes);
|
||||
v_uint16 values2 = vx_load(src1 + x + VTraits<v_uint16>::vlanes());
|
||||
v_uint16 low2 = vx_load(src2 + x + VTraits<v_uint16>::vlanes());
|
||||
v_uint16 high2 = vx_load(src3 + x + VTraits<v_uint16>::vlanes());
|
||||
|
||||
v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
|
||||
v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
@ -1414,7 +1414,7 @@ struct InRange_SIMD<short>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = (int)v_int16::nlanes * 2;
|
||||
const int width = (int)VTraits<v_int16>::vlanes() * 2;
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1422,11 +1422,11 @@ struct InRange_SIMD<short>
|
||||
v_int16 low1 = vx_load(src2 + x);
|
||||
v_int16 high1 = vx_load(src3 + x);
|
||||
|
||||
v_int16 values2 = vx_load(src1 + x + v_int16::nlanes);
|
||||
v_int16 low2 = vx_load(src2 + x + v_int16::nlanes);
|
||||
v_int16 high2 = vx_load(src3 + x + v_int16::nlanes);
|
||||
v_int16 values2 = vx_load(src1 + x + VTraits<v_int16>::vlanes());
|
||||
v_int16 low2 = vx_load(src2 + x + VTraits<v_int16>::vlanes());
|
||||
v_int16 high2 = vx_load(src3 + x + VTraits<v_int16>::vlanes());
|
||||
|
||||
v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
|
||||
v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
@ -1440,7 +1440,7 @@ struct InRange_SIMD<int>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = (int)v_int32::nlanes * 2;
|
||||
const int width = (int)VTraits<v_int32>::vlanes() * 2;
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1448,11 +1448,11 @@ struct InRange_SIMD<int>
|
||||
v_int32 low1 = vx_load(src2 + x);
|
||||
v_int32 high1 = vx_load(src3 + x);
|
||||
|
||||
v_int32 values2 = vx_load(src1 + x + v_int32::nlanes);
|
||||
v_int32 low2 = vx_load(src2 + x + v_int32::nlanes);
|
||||
v_int32 high2 = vx_load(src3 + x + v_int32::nlanes);
|
||||
v_int32 values2 = vx_load(src1 + x + VTraits<v_int32>::vlanes());
|
||||
v_int32 low2 = vx_load(src2 + x + VTraits<v_int32>::vlanes());
|
||||
v_int32 high2 = vx_load(src3 + x + VTraits<v_int32>::vlanes());
|
||||
|
||||
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
|
||||
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
@ -1466,7 +1466,7 @@ struct InRange_SIMD<float>
|
||||
uchar * dst, int len) const
|
||||
{
|
||||
int x = 0;
|
||||
const int width = (int)v_float32::nlanes * 2;
|
||||
const int width = (int)VTraits<v_float32>::vlanes() * 2;
|
||||
|
||||
for (; x <= len - width; x += width)
|
||||
{
|
||||
@ -1474,12 +1474,12 @@ struct InRange_SIMD<float>
|
||||
v_float32 low1 = vx_load(src2 + x);
|
||||
v_float32 high1 = vx_load(src3 + x);
|
||||
|
||||
v_float32 values2 = vx_load(src1 + x + v_float32::nlanes);
|
||||
v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
|
||||
v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
|
||||
v_float32 values2 = vx_load(src1 + x + VTraits<v_float32>::vlanes());
|
||||
v_float32 low2 = vx_load(src2 + x + VTraits<v_float32>::vlanes());
|
||||
v_float32 high2 = vx_load(src3 + x + VTraits<v_float32>::vlanes());
|
||||
|
||||
v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
|
||||
v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
|
||||
v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))),
|
||||
v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2)))));
|
||||
}
|
||||
vx_cleanup();
|
||||
return x;
|
||||
|
@ -215,7 +215,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_add
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a + b; }
|
||||
{ return v_add(a, b); }
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
{ return c_add(a, b); }
|
||||
};
|
||||
@ -225,7 +225,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_sub
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a - b; }
|
||||
{ return v_sub(a, b); }
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
{ return c_sub(a, b); }
|
||||
};
|
||||
@ -262,7 +262,7 @@ struct op_absdiff
|
||||
template<>
|
||||
struct op_absdiff<schar, v_int8>
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
static inline v_int8 r(const v_int8& a, const v_int8& b)
|
||||
{ return v_absdiffs(a, b); }
|
||||
#endif
|
||||
@ -272,7 +272,7 @@ struct op_absdiff<schar, v_int8>
|
||||
template<>
|
||||
struct op_absdiff<short, v_int16>
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
static inline v_int16 r(const v_int16& a, const v_int16& b)
|
||||
{ return v_absdiffs(a, b); }
|
||||
#endif
|
||||
@ -282,7 +282,7 @@ struct op_absdiff<short, v_int16>
|
||||
template<>
|
||||
struct op_absdiff<int, v_int32>
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
static inline v_int32 r(const v_int32& a, const v_int32& b)
|
||||
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
|
||||
#endif
|
||||
@ -295,7 +295,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_or
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a | b; }
|
||||
{ return v_or(a, b); }
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
{ return a | b; }
|
||||
};
|
||||
@ -303,7 +303,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_xor
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a ^ b; }
|
||||
{ return v_xor(a, b); }
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
{ return a ^ b; }
|
||||
};
|
||||
@ -311,7 +311,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_and
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a & b; }
|
||||
{ return v_and(a, b); }
|
||||
static inline T1 r(T1 a, T1 b)
|
||||
{ return a & b; }
|
||||
};
|
||||
@ -320,14 +320,14 @@ struct op_not
|
||||
{
|
||||
// ignored b from loader level
|
||||
static inline Tvec r(const Tvec& a)
|
||||
{ return ~a; }
|
||||
{ return v_not(a); }
|
||||
static inline T1 r(T1 a, T1)
|
||||
{ return ~a; }
|
||||
};
|
||||
|
||||
//////////////////////////// Loaders /////////////////////////////////
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
|
||||
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
||||
struct bin_loader
|
||||
@ -392,13 +392,13 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
|
||||
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
|
||||
{
|
||||
typedef OP<T1, Tvec> op;
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
typedef bin_loader<OP, T1, Tvec> ldr;
|
||||
enum {wide_step = Tvec::nlanes};
|
||||
const int wide_step = VTraits<Tvec>::vlanes();
|
||||
#if !CV_NEON && CV_SIMD_WIDTH == 16
|
||||
enum {wide_step_l = wide_step * 2};
|
||||
const int wide_step_l = wide_step * 2;
|
||||
#else
|
||||
enum {wide_step_l = wide_step};
|
||||
const int wide_step_l = wide_step;
|
||||
#endif
|
||||
#endif // CV_SIMD
|
||||
|
||||
@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
#if !CV_NEON && !CV_MSA
|
||||
if (is_aligned(src1, src2, dst))
|
||||
{
|
||||
@ -583,7 +583,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_cmplt
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a < b; }
|
||||
{ return v_lt(a, b); }
|
||||
static inline uchar r(T1 a, T1 b)
|
||||
{ return (uchar)-(int)(a < b); }
|
||||
};
|
||||
@ -592,7 +592,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_cmple
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a <= b; }
|
||||
{ return v_le(a, b); }
|
||||
static inline uchar r(T1 a, T1 b)
|
||||
{ return (uchar)-(int)(a <= b); }
|
||||
};
|
||||
@ -601,7 +601,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_cmpeq
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a == b; }
|
||||
{ return v_eq(a, b); }
|
||||
static inline uchar r(T1 a, T1 b)
|
||||
{ return (uchar)-(int)(a == b); }
|
||||
};
|
||||
@ -610,14 +610,14 @@ template<typename T1, typename Tvec>
|
||||
struct op_cmpne
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a != b; }
|
||||
{ return v_ne(a, b); }
|
||||
static inline uchar r(T1 a, T1 b)
|
||||
{ return (uchar)-(int)(a != b); }
|
||||
};
|
||||
|
||||
//////////////////////////// Loaders /////////////////////////////////
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
// todo: add support for RW alignment & stream
|
||||
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
|
||||
struct cmp_loader_n
|
||||
@ -642,10 +642,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
|
||||
struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
|
||||
{
|
||||
typedef OP<T1, Tvec> op;
|
||||
enum {step = Tvec::nlanes};
|
||||
|
||||
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
||||
{
|
||||
const int step = VTraits<Tvec>::vlanes();
|
||||
Tvec c0 = op::r(vx_load(src1), vx_load(src2));
|
||||
Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
|
||||
v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
|
||||
@ -656,10 +656,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
|
||||
struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
|
||||
{
|
||||
typedef OP<T1, Tvec> op;
|
||||
enum {step = Tvec::nlanes};
|
||||
|
||||
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
||||
{
|
||||
const int step = VTraits<Tvec>::vlanes();
|
||||
v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
|
||||
v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
|
||||
v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
|
||||
@ -672,10 +672,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
|
||||
struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
|
||||
{
|
||||
typedef OP<T1, Tvec> op;
|
||||
enum {step = Tvec::nlanes};
|
||||
|
||||
static inline void l(const T1* src1, const T1* src2, uchar* dst)
|
||||
{
|
||||
const int step = VTraits<Tvec>::vlanes();
|
||||
v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
|
||||
v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
|
||||
v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
|
||||
@ -697,9 +697,9 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
|
||||
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
|
||||
{
|
||||
typedef OP<T1, Tvec> op;
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
|
||||
enum {wide_step = Tvec::nlanes * sizeof(T1)};
|
||||
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
|
||||
#endif // CV_SIMD
|
||||
|
||||
step1 /= sizeof(T1);
|
||||
@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
for (; x <= width - wide_step; x += wide_step)
|
||||
{
|
||||
ldr::l(src1 + x, src2 + x, dst + x);
|
||||
@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp)
|
||||
|
||||
//////////////////////////// Loaders ///////////////////////////////
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
// todo: add support for RW alignment & stream
|
||||
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
|
||||
struct scalar_loader_n
|
||||
@ -1009,10 +1009,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
|
||||
struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
|
||||
{
|
||||
typedef OP<int, T2, v_int32> op;
|
||||
enum {step = v_int32::nlanes};
|
||||
|
||||
static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
|
||||
{
|
||||
const int step = VTraits<v_int32>::vlanes();
|
||||
v_int32 v_src1 = vx_load(src1);
|
||||
v_int32 v_src2 = vx_load(src2);
|
||||
v_int32 v_src1s = vx_load(src1 + step);
|
||||
@ -1039,6 +1039,7 @@ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
|
||||
|
||||
static inline void l(const int* src1, const T2* scalar, int* dst)
|
||||
{
|
||||
const int step = VTraits<v_int32>::vlanes();
|
||||
v_int32 v_src1 = vx_load(src1);
|
||||
v_int32 v_src1s = vx_load(src1 + step);
|
||||
|
||||
@ -1064,10 +1065,9 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
|
||||
struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
|
||||
{
|
||||
typedef OP<float, T2, v_float32> op;
|
||||
enum {step = v_float32::nlanes};
|
||||
|
||||
static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
|
||||
{
|
||||
const int step = VTraits<v_float32>::vlanes();
|
||||
v_float32 v_src1 = vx_load(src1);
|
||||
v_float32 v_src2 = vx_load(src2);
|
||||
v_float32 v_src1s = vx_load(src1 + step);
|
||||
@ -1082,6 +1082,7 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
|
||||
|
||||
static inline void l(const float* src1, const T2* scalar, float* dst)
|
||||
{
|
||||
const int step = VTraits<v_float32>::vlanes();
|
||||
v_float32 v_src1 = vx_load(src1);
|
||||
v_float32 v_src1s = vx_load(src1 + step);
|
||||
|
||||
@ -1258,10 +1259,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
|
||||
T1* dst, size_t step, int width, int height, const T2* scalar)
|
||||
{
|
||||
typedef OP<T1, T2, Tvec> op;
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
|
||||
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
|
||||
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
|
||||
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
|
||||
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
|
||||
#endif // CV_SIMD
|
||||
|
||||
step1 /= sizeof(T1);
|
||||
@ -1272,7 +1273,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
for (; x <= width - wide_step; x += wide_step)
|
||||
{
|
||||
ldr::l(src1 + x, src2 + x, scalar, dst + x);
|
||||
@ -1304,10 +1305,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
|
||||
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
|
||||
{
|
||||
typedef OP<T1, T2, Tvec> op;
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
|
||||
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
|
||||
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
|
||||
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
|
||||
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
|
||||
#endif // CV_SIMD
|
||||
|
||||
step1 /= sizeof(T1);
|
||||
@ -1317,7 +1318,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD || CV_SIMD_SCALABLE
|
||||
for (; x <= width - wide_step; x += wide_step)
|
||||
{
|
||||
ldr::l(src1 + x, scalar, dst + x);
|
||||
@ -1424,7 +1425,7 @@ template<typename T1, typename Tvec>
|
||||
struct op_mul
|
||||
{
|
||||
static inline Tvec r(const Tvec& a, const Tvec& b)
|
||||
{ return a * b; }
{ return v_mul(a, b); }
static inline T1 r(T1 a, T1 b)
{ return saturate_cast<T1>(a * b); }
};
@ -1432,11 +1433,11 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar * a * b;
return v_mul(v_scalar , a , b);
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
@ -1452,7 +1453,7 @@ struct op_mul_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar * a * b;
return v_mul(v_mul(v_scalar, a), b);
}
#endif
static inline double r(double a, double b, const double* scalar)
@ -1565,7 +1566,7 @@ template<typename T1, typename Tvec>
struct op_div_f
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a / b; }
{ return v_div(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a / b; }
};
@ -1573,16 +1574,16 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 a, T1 denom, const T2* scalar)
@ -1595,11 +1596,11 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline float r(float a, float denom, const float* scalar)
@ -1613,7 +1614,7 @@ struct op_div_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline double r(double a, double denom, const double* scalar)
@ -1681,7 +1682,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1714,7 +1715,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1831,16 +1832,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 denom, const T2* scalar)
@ -1853,11 +1854,11 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline float r(float denom, const float* scalar)
@ -1871,7 +1872,7 @@ struct op_recip<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline double r(double denom, const double* scalar)
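The arithmetic hunks above all apply one pattern: operator expressions on universal-intrinsic vectors (`*`, `/`, `==`) become explicit wrapper calls (v_mul, v_div, v_eq), and Tvec::lane_type becomes VTraits<Tvec>::lane_type, so the same code also builds against the scalable (CV_SIMD_SCALABLE, e.g. RVV) backend, whose vector types are sizeless and carry no operator overloads or static members. A minimal sketch of the resulting style follows; the buffer names a, b, dst and the scalar pointer are assumptions for illustration, not part of the patch.

// Sketch only: divide a row a by b, scaled by *scalar, written with the
// wrapper intrinsics used throughout this patch.
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
static void div_scale_row(const float* a, const float* b, const float* scalar,
                          float* dst, int len)
{
    using namespace cv;
    const v_float32 v_scalar = vx_setall_f32(*scalar);
    const int step = VTraits<v_float32>::vlanes();  // lane count is a run-time value on RVV
    int j = 0;
    for (; j <= len - step; j += step)
    {
        v_float32 va = vx_load(a + j), vb = vx_load(b + j);
        v_store(dst + j, v_div(v_mul(va, v_scalar), vb));  // (a * scalar) / b
    }
    for (; j < len; j++)                                    // scalar tail
        dst[j] = a[j] * *scalar / b[j];
}
#endif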
@ -4,6 +4,8 @@

#include "precomp.hpp"

#include <sstream>

#include "opencv2/core/check.hpp"

namespace cv {
@ -11,7 +11,7 @@
|
||||
namespace cv
|
||||
{
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
|
||||
static inline void vx_load_as(const uchar* ptr, v_float32& a)
|
||||
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
|
||||
@ -78,7 +78,7 @@ static inline void v_store_as(int64_t* ptr, const v_float32& a)
|
||||
v_int64 ia_0, ia_1;
|
||||
v_expand(ia, ia_0, ia_1);
|
||||
v_store(ptr, ia_0);
|
||||
v_store(ptr + v_int64::nlanes, ia_1);
|
||||
v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
|
||||
}
|
||||
|
||||
static inline void v_store_as(uint64_t* ptr, const v_float32& a)
|
||||
@ -88,7 +88,7 @@ static inline void v_store_as(uint64_t* ptr, const v_float32& a)
|
||||
ia = v_max(ia, vx_setzero_s32());
|
||||
v_expand(v_reinterpret_as_u32(ia), ia_0, ia_1);
|
||||
v_store(ptr, ia_0);
|
||||
v_store(ptr + v_int64::nlanes, ia_1);
|
||||
v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
|
||||
@ -104,7 +104,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
|
||||
|
||||
static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
|
||||
{
|
||||
@ -118,7 +118,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
|
||||
{ v_expand(vx_load(ptr), a, b); }
|
||||
|
||||
static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
|
||||
|
||||
static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
@ -147,7 +147,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
|
||||
static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
a = vx_load(ptr);
|
||||
b = vx_load(ptr + v_int32::nlanes);
|
||||
b = vx_load(ptr + VTraits<v_int32>::vlanes());
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
|
||||
@ -184,14 +184,14 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
|
||||
|
||||
static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
|
||||
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_int32>::vlanes());
|
||||
a = v_cvt_f32(ia);
|
||||
b = v_cvt_f32(ib);
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
const int int64_nlanes = v_int64::nlanes;
|
||||
const int int64_nlanes = VTraits<v_uint64>::vlanes();
|
||||
a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
|
||||
b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
|
||||
}
|
||||
@ -199,7 +199,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
|
||||
static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
|
||||
{
|
||||
v_int64 z = vx_setzero_s64();
|
||||
v_int64 ia = vx_load(ptr), ib = vx_load(ptr + v_int64::nlanes);
|
||||
v_int64 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_uint64>::vlanes());
|
||||
ia &= (ia > z);
|
||||
ib &= (ib > z);
|
||||
a = v_reinterpret_as_u64(ia);
|
||||
@ -208,7 +208,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
|
||||
|
||||
static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
|
||||
{
|
||||
const int nlanes = v_int64::nlanes;
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
v_int64 z = vx_setzero_s64();
|
||||
v_int64 ia0 = vx_load(ptr), ia1 = vx_load(ptr + nlanes);
|
||||
v_int64 ib0 = vx_load(ptr + nlanes*2), ib1 = vx_load(ptr + nlanes*3);
|
||||
@ -222,8 +222,8 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
|
||||
|
||||
static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
const int nlanes = v_uint64::nlanes;
|
||||
float buf[v_uint64::nlanes*4];
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
float buf[VTraits<v_uint64>::max_nlanes*4];
|
||||
for (int i = 0; i < nlanes*4; i++) {
|
||||
buf[i] = (float)ptr[i];
|
||||
}
|
||||
@ -233,8 +233,8 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32&
|
||||
|
||||
static inline void vx_load_pair_as(const int64_t* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
const int nlanes = v_int64::nlanes;
|
||||
float buf[v_int64::nlanes*4];
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
float buf[VTraits<v_uint64>::max_nlanes*4];
|
||||
for (int i = 0; i < nlanes*4; i++) {
|
||||
buf[i] = (float)ptr[i];
|
||||
}
|
||||
@ -277,21 +277,21 @@ static inline void vx_load_pair_as(const int* ptr, v_uint32& a, v_uint32& b)
|
||||
{
|
||||
v_int32 z = vx_setzero_s32();
|
||||
v_int32 ia = v_max(vx_load(ptr), z);
|
||||
v_int32 ib = v_max(vx_load(ptr + v_int32::nlanes), z);
|
||||
v_int32 ib = v_max(vx_load(ptr + VTraits<v_int32>::vlanes()), z);
|
||||
a = v_reinterpret_as_u32(ia);
|
||||
b = v_reinterpret_as_u32(ib);
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const uint64_t* ptr, v_uint32& a, v_uint32& b)
|
||||
{
|
||||
const int int64_nlanes = v_int64::nlanes;
|
||||
const int int64_nlanes = VTraits<v_uint64>::vlanes();
|
||||
a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
|
||||
b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
const int int64_nlanes = v_int64::nlanes;
|
||||
const int int64_nlanes = VTraits<v_uint64>::vlanes();
|
||||
v_uint32 ua = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
|
||||
v_uint32 ub = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
|
||||
a = v_reinterpret_as_s32(ua);
|
||||
@ -299,37 +299,37 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
|
||||
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_float32>::vlanes()); }
|
||||
|
||||
static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
a = vx_load_expand(ptr);
|
||||
b = vx_load_expand(ptr + v_float32::nlanes);
|
||||
b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
a = vx_load_expand(ptr);
|
||||
b = vx_load_expand(ptr + v_float32::nlanes);
|
||||
b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const unsigned* ptr, v_uint32& a, v_uint32& b)
|
||||
{
|
||||
a = vx_load(ptr);
|
||||
b = vx_load(ptr + v_uint32::nlanes);
|
||||
b = vx_load(ptr + VTraits<v_uint32>::vlanes());
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const unsigned* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
a = v_reinterpret_as_s32(vx_load(ptr));
|
||||
b = v_reinterpret_as_s32(vx_load(ptr + v_uint32::nlanes));
|
||||
b = v_reinterpret_as_s32(vx_load(ptr + VTraits<v_uint32>::vlanes()));
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const unsigned* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
v_uint32 delta = vx_setall_u32(0x80000000U);
|
||||
v_uint32 ua = vx_load(ptr);
|
||||
v_uint32 ub = vx_load(ptr + v_uint32::nlanes);
|
||||
v_uint32 ub = vx_load(ptr + VTraits<v_uint32>::vlanes());
|
||||
v_uint32 mask_a = (ua >= delta) & delta, mask_b = (ub >= delta) & delta;
|
||||
v_float32 fmask_a = v_cvt_f32(v_reinterpret_as_s32(mask_a)); // 0.f or (float)(-(1 << 31))
|
||||
v_float32 fmask_b = v_cvt_f32(v_reinterpret_as_s32(mask_b)); // 0.f or (float)(-(1 << 31))
|
||||
@ -353,7 +353,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
|
||||
{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
|
||||
{ v_store(ptr, a); v_store(ptr + VTraits<v_uint16>::vlanes(), b); }
|
||||
|
||||
static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
|
||||
{ v_store(ptr, v_pack_u(a, b)); }
|
||||
@ -362,7 +362,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16&
|
||||
{ v_store(ptr, v_pack(a, b)); }
|
||||
|
||||
static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
|
||||
{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
|
||||
{ v_store(ptr, a); v_store(ptr + VTraits<v_int16>::vlanes(), b); }
|
||||
|
||||
static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
|
||||
{ v_pack_u_store(ptr, v_pack(a, b)); }
|
||||
@ -379,7 +379,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32&
|
||||
static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
|
||||
{
|
||||
v_store(ptr, a);
|
||||
v_store(ptr + v_int32::nlanes, b);
|
||||
v_store(ptr + VTraits<v_int32>::vlanes(), b);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32& b)
|
||||
@ -387,7 +387,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32
|
||||
v_int64 q0, q1, q2, q3;
|
||||
v_expand(a, q0, q1);
|
||||
v_expand(b, q2, q3);
|
||||
const int nlanes = v_int64::nlanes;
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
v_store(ptr, q0);
|
||||
v_store(ptr + nlanes, q1);
|
||||
v_store(ptr + nlanes*2, q2);
|
||||
@ -419,11 +419,11 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
|
||||
{
|
||||
v_int32 ia = v_round(a), ib = v_round(b);
|
||||
v_store(ptr, ia);
|
||||
v_store(ptr + v_int32::nlanes, ib);
|
||||
v_store(ptr + VTraits<v_int32>::vlanes(), ib);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
|
||||
{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
|
||||
{ v_store(ptr, a); v_store(ptr + VTraits<v_float32>::vlanes(), b); }
|
||||
|
||||
static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_float32& b)
|
||||
{
|
||||
@ -431,7 +431,7 @@ static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_fl
|
||||
v_int32 ia = v_max(v_round(a), z);
|
||||
v_int32 ib = v_max(v_round(b), z);
|
||||
v_store(ptr, v_reinterpret_as_u32(ia));
|
||||
v_store(ptr + v_int32::nlanes, v_reinterpret_as_u32(ib));
|
||||
v_store(ptr + VTraits<v_int32>::vlanes(), v_reinterpret_as_u32(ib));
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(uchar* ptr, const v_uint32& a, const v_uint32& b)
|
||||
@ -447,7 +447,7 @@ static inline void v_store_pair_as(ushort* ptr, const v_uint32& a, const v_uint3
|
||||
static inline void v_store_pair_as(unsigned* ptr, const v_uint32& a, const v_uint32& b)
|
||||
{
|
||||
v_store(ptr, a);
|
||||
v_store(ptr + v_uint32::nlanes, b);
|
||||
v_store(ptr + VTraits<v_uint32>::vlanes(), b);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uint32& b)
|
||||
@ -455,7 +455,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
|
||||
v_uint64 q0, q1, q2, q3;
|
||||
v_expand(a, q0, q1);
|
||||
v_expand(b, q2, q3);
|
||||
const int nlanes = v_uint64::nlanes;
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
v_store(ptr, q0);
|
||||
v_store(ptr + nlanes, q1);
|
||||
v_store(ptr + nlanes*2, q2);
|
||||
@ -465,28 +465,28 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
|
||||
static inline void v_store_pair_as(uint64_t* ptr, const v_uint64& a, const v_uint64& b)
|
||||
{
|
||||
v_store(ptr, a);
|
||||
v_store(ptr + v_uint64::nlanes, b);
|
||||
v_store(ptr + VTraits<v_uint64>::vlanes(), b);
|
||||
}
|
||||
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
|
||||
static inline void vx_load_as(const uint64_t* ptr, v_float32& a)
|
||||
{
|
||||
v_float64 a_0 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
|
||||
v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + v_uint64::nlanes)));
|
||||
v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + VTraits<v_uint64>::vlanes())));
|
||||
a = v_cvt_f32(a_0, a_1);
|
||||
}
|
||||
|
||||
static inline void vx_load_as(const int64_t* ptr, v_float32& a)
|
||||
{
|
||||
v_float64 a_0 = v_cvt_f64(vx_load(ptr));
|
||||
v_float64 a_1 = v_cvt_f64(vx_load(ptr + v_uint64::nlanes));
|
||||
v_float64 a_1 = v_cvt_f64(vx_load(ptr + VTraits<v_uint64>::vlanes()));
|
||||
a = v_cvt_f32(a_0, a_1);
|
||||
}
|
||||
|
||||
static inline void vx_load_as(const double* ptr, v_float32& a)
|
||||
{
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
|
||||
a = v_cvt_f32(v0, v1);
|
||||
}
|
||||
|
||||
@ -516,8 +516,8 @@ static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float64& a, v_float6
|
||||
|
||||
static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
|
||||
{
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
|
||||
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
|
||||
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
|
||||
v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
|
||||
v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
|
||||
a = v_combine_low(iv0, iv1);
|
||||
@ -526,15 +526,15 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
|
||||
|
||||
static inline void vx_load_pair_as(const uint64_t* ptr, v_float64& a, v_float64& b)
|
||||
{
|
||||
const int int64_nlanes = v_int64::nlanes;
|
||||
const int int64_nlanes = VTraits<v_uint64>::vlanes();
|
||||
a = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
|
||||
b = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + int64_nlanes)));
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
|
||||
{
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
|
||||
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
|
||||
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
|
||||
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
|
||||
a = v_cvt_f32(v0, v1);
|
||||
b = v_cvt_f32(v2, v3);
|
||||
}
|
||||
@ -584,19 +584,19 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
|
||||
static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
|
||||
{
|
||||
a = vx_load(ptr);
|
||||
b = vx_load(ptr + v_float64::nlanes);
|
||||
b = vx_load(ptr + VTraits<v_float64>::vlanes());
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const int64_t* ptr, v_float64& a, v_float64& b)
|
||||
{
|
||||
a = v_cvt_f64(vx_load(ptr));
|
||||
b = v_cvt_f64(vx_load(ptr + v_float64::nlanes));
|
||||
b = v_cvt_f64(vx_load(ptr + VTraits<v_float64>::vlanes()));
|
||||
}
|
||||
|
||||
static inline void vx_load_pair_as(const unsigned* ptr, v_float64& a, v_float64& b)
|
||||
{
|
||||
const int nlanes = v_uint64::nlanes;
|
||||
double buf[v_uint64::nlanes*2];
|
||||
const int nlanes = VTraits<v_uint64>::vlanes();
|
||||
double buf[VTraits<v_uint64>::max_nlanes*2];
|
||||
for (int i = 0; i < nlanes*2; i++)
|
||||
buf[i] = (double)ptr[i];
|
||||
a = vx_load(buf);
|
||||
@ -607,7 +607,7 @@ static inline void v_store_as(double* ptr, const v_float32& a)
|
||||
{
|
||||
v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
|
||||
v_store(ptr, fa0);
|
||||
v_store(ptr + v_float64::nlanes, fa1);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
|
||||
@ -616,9 +616,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32&
|
||||
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
|
||||
|
||||
v_store(ptr, fa0);
|
||||
v_store(ptr + v_float64::nlanes, fa1);
|
||||
v_store(ptr + v_float64::nlanes*2, fb0);
|
||||
v_store(ptr + v_float64::nlanes*3, fb1);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
|
||||
@ -627,15 +627,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa
|
||||
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
|
||||
|
||||
v_store(ptr, fa0);
|
||||
v_store(ptr + v_float64::nlanes, fa1);
|
||||
v_store(ptr + v_float64::nlanes*2, fb0);
|
||||
v_store(ptr + v_float64::nlanes*3, fb1);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
|
||||
{
|
||||
v_store(ptr, a);
|
||||
v_store(ptr + v_float64::nlanes, b);
|
||||
v_store(ptr + VTraits<v_float64>::vlanes(), b);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)
|
||||
@ -662,7 +662,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_float64& a, const v_fl
|
||||
v_int64 ia, ib;
|
||||
v_expand(v_round(v_max(a, z), v_max(b, z)), ia, ib);
|
||||
v_store(ptr, v_reinterpret_as_u64(ia));
|
||||
v_store(ptr + v_int64::nlanes, v_reinterpret_as_u64(ib));
|
||||
v_store(ptr + VTraits<v_uint64>::vlanes(), v_reinterpret_as_u64(ib));
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_float64& b)
|
||||
@ -670,7 +670,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_flo
|
||||
v_int64 ia, ib;
|
||||
v_expand(v_round(a, b), ia, ib);
|
||||
v_store(ptr, ia);
|
||||
v_store(ptr + v_int64::nlanes, ib);
|
||||
v_store(ptr + VTraits<v_uint64>::vlanes(), ib);
|
||||
}
|
||||
|
||||
static inline void v_store_pair_as(unsigned* ptr, const v_float64& a, const v_float64& b)
|
||||
@ -744,9 +744,9 @@ static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b
|
||||
ptr[i] = (double)buf[i];
|
||||
}
|
||||
|
||||
#endif /////////// CV_SIMD_64F
|
||||
#endif /////////// CV_SIMD_64F || CV_SIMD_SCALABLE_64F
|
||||
|
||||
#endif /////////// CV_SIMD
|
||||
#endif /////////// CV_SIMD || CV_SIMD_SCALABLE
|
||||
|
||||
}
|
||||
|
||||
|
@ -41,8 +41,8 @@ void cvt16f32f( const float16_t* src, float* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -62,8 +62,8 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -83,8 +83,8 @@ void cvt32f16bf( const float* src, bfloat16_t* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -153,8 +153,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes*2;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes()*2;
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -182,8 +182,8 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes*2;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int VECSZ = VTraits<v_float64>::vlanes()*2;
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -213,8 +213,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes();
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
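In the cvt* kernels only the lane count changes: the compile-time constant v_float32::nlanes becomes the run-time call VTraits<v_float32>::vlanes(), while the existing tail handling (re-running the last, partially overlapping vector when j > len - VECSZ) stays as it is. A small sketch of that loop shape, with an illustrative fp16-to-fp32 body and assumed buffer names:

// Sketch of the VECSZ loop pattern after the change; src/dst/len are assumed.
#include "opencv2/core/hal/intrin.hpp"

static void cvt16f32f_sketch(const cv::float16_t* src, float* dst, int len)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int VECSZ = cv::VTraits<cv::v_float32>::vlanes();  // queried at run time
    for (; j < len; j += VECSZ)
    {
        if (j > len - VECSZ)
        {
            if (j == 0)        // len < VECSZ: leave everything to the scalar tail
                break;
            j = len - VECSZ;   // overlap the last vector with the previous one
        }
        cv::v_store(dst + j, cv::vx_load_expand(src + j));   // expand fp16 to fp32
    }
#endif
    for (; j < len; j++)
        dst[j] = (float)src[j];
}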
@ -22,9 +22,9 @@ template<typename _Ts, typename _Td> inline void
|
||||
cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
Size size, float a, float b )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
|
||||
const int VECSZ = v_float32::nlanes*2;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes()*2;
|
||||
#endif
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
|
||||
{
|
||||
int j = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for( ; j < size.width; j += VECSZ )
|
||||
{
|
||||
if( j > size.width - VECSZ )
|
||||
@ -72,9 +72,9 @@ template<typename _Ts, typename _Td> inline void
|
||||
cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
Size size, float a, float b )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
|
||||
const int VECSZ = v_float32::nlanes*2;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes()*2;
|
||||
#endif
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
@ -82,7 +82,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
|
||||
{
|
||||
int j = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for( ; j < size.width; j += VECSZ )
|
||||
{
|
||||
if( j > size.width - VECSZ )
|
||||
@ -108,9 +108,9 @@ template<typename _Ts, typename _Td> inline void
|
||||
cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
Size size, float a, float b )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
|
||||
const int VECSZ = v_float32::nlanes;
|
||||
const int VECSZ = VTraits<v_float32>::vlanes();
|
||||
#endif
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
@ -118,7 +118,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
|
||||
{
|
||||
int j = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for( ; j < size.width; j += VECSZ )
|
||||
{
|
||||
if( j > size.width - VECSZ )
|
||||
@ -143,9 +143,9 @@ template<typename _Ts, typename _Td> inline void
|
||||
cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
Size size, double a, double b )
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
|
||||
const int VECSZ = v_float64::nlanes*2;
|
||||
const int VECSZ = VTraits<v_float64>::vlanes()*2;
|
||||
#endif
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
@ -153,7 +153,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
|
||||
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
|
||||
{
|
||||
int j = 0;
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
for( ; j < size.width; j += VECSZ )
|
||||
{
|
||||
if( j > size.width - VECSZ )
|
||||
|
@ -171,15 +171,15 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
const uchar* src = (const uchar*)_src;
uchar* dst = (uchar*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();

for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint8 v_src = vx_load(src + x),
v_dst = vx_load(dst + x),
v_nmask = vx_load(mask + x) == v_zero;
v_nmask = v_eq(vx_load(mask + x), v_zero);

v_dst = v_select(v_nmask, v_dst, v_src);
v_store(dst + x, v_dst);
@ -203,23 +203,23 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
const ushort* src = (const ushort*)_src;
ushort* dst = (ushort*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();

for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits<v_uint16>::vlanes()),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits<v_uint16>::vlanes());

v_uint8 v_nmask1, v_nmask2;
v_uint8 v_nmask = vx_load(mask + x) == v_zero;
v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero);
v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);

v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
v_store(dst + x, v_dst1);
v_store(dst + x + v_uint16::nlanes, v_dst2);
v_store(dst + x + VTraits<v_uint16>::vlanes(), v_dst2);
}
}
vx_cleanup();
@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len )
|
||||
static int countNonZero8u( const uchar* src, int len )
|
||||
{
|
||||
int i=0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint8::nlanes;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_uint8>::vlanes();
|
||||
v_uint8 v_zero = vx_setzero_u8();
|
||||
v_uint8 v_one = vx_setall_u8(1);
|
||||
|
||||
@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len )
|
||||
{
|
||||
v_uint16 v_sum16 = vx_setzero_u16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
|
||||
while (j < std::min(len0, i + 65280 * VTraits<v_uint16>::vlanes()))
|
||||
{
|
||||
v_uint8 v_sum8 = vx_setzero_u8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
|
||||
v_sum8 += v_one & (vx_load(src + k) == v_zero);
|
||||
for (; k < std::min(len0, j + 255 * VTraits<v_uint8>::vlanes()); k += VTraits<v_uint8>::vlanes())
|
||||
v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero)));
|
||||
v_uint16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
v_sum16 = v_add(v_sum16, v_add(part1, part2));
|
||||
j = k;
|
||||
}
|
||||
v_uint32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
v_sum32 = v_add(v_sum32, v_add(part1, part2));
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
@ -69,8 +69,8 @@ static int countNonZero8u( const uchar* src, int len )
|
||||
static int countNonZero16u( const ushort* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_int8>::vlanes();
|
||||
v_uint16 v_zero = vx_setzero_u16();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len )
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
|
||||
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
|
||||
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits<v_uint16>::vlanes()), v_zero)))));
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
v_sum16 = v_add(v_sum16, v_add(part1, part2));
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
v_sum32 = v_add(v_sum32, v_add(part1, part2));
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len )
|
||||
static int countNonZero32s( const int* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_int8>::vlanes();
|
||||
v_int32 v_zero = vx_setzero_s32();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len )
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(
|
||||
v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero),
|
||||
v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
|
||||
);
|
||||
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
|
||||
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits<v_int32>::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits<v_int32>::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits<v_int32>::vlanes()), v_zero)))));
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
v_sum16 = v_add(v_sum16, v_add(part1, part2));
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
v_sum32 = v_add(v_sum32, v_add(part1, part2));
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len )
|
||||
static int countNonZero32f( const float* src, int len )
|
||||
{
|
||||
int i = 0, nz = 0;
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_int8>::vlanes();
|
||||
v_float32 v_zero = vx_setzero_f32();
|
||||
v_int8 v_one = vx_setall_s8(1);
|
||||
|
||||
@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len )
|
||||
{
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
int j = i;
|
||||
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
|
||||
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
|
||||
{
|
||||
v_int8 v_sum8 = vx_setzero_s8();
|
||||
int k = j;
|
||||
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
|
||||
v_sum8 += v_one & v_pack(
|
||||
v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)),
|
||||
v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
|
||||
);
|
||||
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
|
||||
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits<v_float32>::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits<v_float32>::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits<v_float32>::vlanes()), v_zero))))));
|
||||
v_int16 part1, part2;
|
||||
v_expand(v_sum8, part1, part2);
|
||||
v_sum16 += part1 + part2;
|
||||
v_sum16 = v_add(v_sum16, v_add(part1, part2));
|
||||
j = k;
|
||||
}
|
||||
v_int32 part1, part2;
|
||||
v_expand(v_sum16, part1, part2);
|
||||
v_sum32 += part1 + part2;
|
||||
v_sum32 = v_add(v_sum32, v_add(part1, part2));
|
||||
i = j;
|
||||
}
|
||||
nz = i - v_reduce_sum(v_sum32);
|
||||
@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len )
|
||||
static int countNonZero64f( const double* src, int len )
|
||||
{
|
||||
int nz = 0, i = 0;
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
v_int64 sum1 = vx_setzero_s64();
|
||||
v_int64 sum2 = vx_setzero_s64();
|
||||
v_float64 zero = vx_setzero_f64();
|
||||
int step = v_float64::nlanes * 2;
|
||||
int step = VTraits<v_float64>::vlanes() * 2;
|
||||
int len0 = len & -step;
|
||||
|
||||
for(i = 0; i < len0; i += step )
|
||||
{
|
||||
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
|
||||
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
|
||||
sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero)));
|
||||
sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero)));
|
||||
}
|
||||
|
||||
// N.B the value is incremented by -1 (0xF...F) for each value
|
||||
nz = i + (int)v_reduce_sum(sum1 + sum2);
|
||||
nz = i + (int)v_reduce_sum(v_add(sum1, sum2));
|
||||
v_cleanup();
|
||||
#endif
|
||||
return nz + countNonZero_(src + i, len - i);
|
||||
|
@ -274,22 +274,21 @@ template<typename T> struct VBLAS
|
||||
{
|
||||
int dot(const T*, const T*, int, T*) const { return 0; }
|
||||
int givens(T*, T*, int, T, T) const { return 0; }
|
||||
int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
|
||||
};
|
||||
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
|
||||
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
|
||||
{
|
||||
if( n < 2*v_float32::nlanes )
|
||||
if( n < 2*VTraits<v_float32>::vlanes() )
|
||||
return 0;
|
||||
int k = 0;
|
||||
v_float32 s0 = vx_setzero_f32();
|
||||
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
|
||||
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 a0 = vx_load(a + k);
|
||||
v_float32 b0 = vx_load(b + k);
|
||||
|
||||
s0 += a0 * b0;
|
||||
s0 = v_add(s0, v_mul(a0, b0));
|
||||
}
|
||||
*result = v_reduce_sum(s0);
|
||||
vx_cleanup();
|
||||
@ -299,16 +298,16 @@ template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, f
|
||||
|
||||
template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
|
||||
{
|
||||
if( n < v_float32::nlanes)
|
||||
if( n < VTraits<v_float32>::vlanes())
|
||||
return 0;
|
||||
int k = 0;
|
||||
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
|
||||
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
|
||||
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 a0 = vx_load(a + k);
|
||||
v_float32 b0 = vx_load(b + k);
|
||||
v_float32 t0 = (a0 * c4) + (b0 * s4);
|
||||
v_float32 t1 = (b0 * c4) - (a0 * s4);
|
||||
v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4));
|
||||
v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4));
|
||||
v_store(a + k, t0);
|
||||
v_store(b + k, t1);
|
||||
}
|
||||
@ -317,44 +316,19 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
|
||||
}
|
||||
|
||||
|
||||
template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
|
||||
float* anorm, float* bnorm) const
|
||||
{
|
||||
if( n < v_float32::nlanes)
|
||||
return 0;
|
||||
int k = 0;
|
||||
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
|
||||
v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
|
||||
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
|
||||
{
|
||||
v_float32 a0 = vx_load(a + k);
|
||||
v_float32 b0 = vx_load(b + k);
|
||||
v_float32 t0 = (a0 * c4) + (b0 * s4);
|
||||
v_float32 t1 = (b0 * c4) - (a0 * s4);
|
||||
v_store(a + k, t0);
|
||||
v_store(b + k, t1);
|
||||
sa += t0 + t0;
|
||||
sb += t1 + t1;
|
||||
}
|
||||
*anorm = v_reduce_sum(sa);
|
||||
*bnorm = v_reduce_sum(sb);
|
||||
vx_cleanup();
|
||||
return k;
|
||||
}
|
||||
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
|
||||
{
|
||||
if( n < 2*v_float64::nlanes )
|
||||
if( n < 2*VTraits<v_float64>::vlanes() )
|
||||
return 0;
|
||||
int k = 0;
|
||||
v_float64 s0 = vx_setzero_f64();
|
||||
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
|
||||
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
|
||||
{
|
||||
v_float64 a0 = vx_load(a + k);
|
||||
v_float64 b0 = vx_load(b + k);
|
||||
|
||||
s0 += a0 * b0;
|
||||
s0 = v_add(s0, v_mul(a0, b0));
|
||||
}
|
||||
double sbuf[2];
|
||||
v_store(sbuf, s0);
|
||||
@ -368,12 +342,12 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
|
||||
{
|
||||
int k = 0;
|
||||
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
|
||||
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
|
||||
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
|
||||
{
|
||||
v_float64 a0 = vx_load(a + k);
|
||||
v_float64 b0 = vx_load(b + k);
|
||||
v_float64 t0 = (a0 * c2) + (b0 * s2);
|
||||
v_float64 t1 = (b0 * c2) - (a0 * s2);
|
||||
v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2));
|
||||
v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2));
|
||||
v_store(a + k, t0);
|
||||
v_store(b + k, t1);
|
||||
}
|
||||
@ -382,30 +356,6 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
|
||||
}
|
||||
|
||||
|
||||
template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double c, double s,
|
||||
double* anorm, double* bnorm) const
|
||||
{
|
||||
int k = 0;
|
||||
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
|
||||
v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
|
||||
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
|
||||
{
|
||||
v_float64 a0 = vx_load(a + k);
|
||||
v_float64 b0 = vx_load(b + k);
|
||||
v_float64 t0 = (a0 * c2) + (b0 * s2);
|
||||
v_float64 t1 = (b0 * c2) - (a0 * s2);
|
||||
v_store(a + k, t0);
|
||||
v_store(b + k, t1);
|
||||
sa += t0 * t0;
|
||||
sb += t1 * t1;
|
||||
}
|
||||
double abuf[2], bbuf[2];
|
||||
v_store(abuf, sa);
|
||||
v_store(bbuf, sb);
|
||||
*anorm = abuf[0] + abuf[1];
|
||||
*bnorm = bbuf[0] + bbuf[1];
|
||||
return k;
|
||||
}
|
||||
#endif //CV_SIMD_64F
|
||||
#endif //CV_SIMD
|
||||
|
||||
@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method )
|
||||
#if CV_SIMD128
|
||||
const float d_32f = (float)d;
|
||||
const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
|
||||
v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
|
||||
v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120
|
||||
s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
|
||||
v_store_low((float*)dstdata, s0);
|
||||
v_store_high((float*)(dstdata + dststep), s0);
|
||||
@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method )
|
||||
d = 1./d;
|
||||
#if CV_SIMD128_64F
|
||||
v_float64x2 det = v_setall_f64(d);
|
||||
v_float64x2 s0 = v_load((const double*)srcdata) * det;
|
||||
v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
|
||||
v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det);
|
||||
v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det);
|
||||
v_float64x2 sm = v_extract<1>(s1, s0);//30
|
||||
v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
|
||||
v_float64x2 ss = v_sub(v_setall<double>(0), v_extract<1>(s0, s1));//12
|
||||
v_store((double*)dstdata, v_combine_low(sm, ss));//31
|
||||
v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
|
||||
#else
|
||||
|
@ -614,13 +614,13 @@ void polarToCart( InputArray src1, InputArray src2,
|
||||
{
|
||||
k = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int cWidth = v_float32::nlanes;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int cWidth = VTraits<v_float32>::vlanes();
|
||||
for( ; k <= len - cWidth; k += cWidth )
|
||||
{
|
||||
v_float32 v_m = vx_load(mag + k);
|
||||
v_store(x + k, vx_load(x + k) * v_m);
|
||||
v_store(y + k, vx_load(y + k) * v_m);
|
||||
v_store(x + k, v_mul(vx_load(x + k), v_m));
|
||||
v_store(y + k, v_mul(vx_load(y + k), v_m));
|
||||
}
|
||||
vx_cleanup();
|
||||
#endif
|
||||
@ -741,7 +741,7 @@ struct iPow_SIMD
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
|
||||
template <>
|
||||
struct iPow_SIMD<uchar, int>
|
||||
@ -751,7 +751,7 @@ struct iPow_SIMD<uchar, int>
|
||||
int i = 0;
|
||||
v_uint32 v_1 = vx_setall_u32(1u);
|
||||
|
||||
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
|
||||
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_uint16 v = vx_load_expand(src + i);
|
||||
@ -763,16 +763,16 @@ struct iPow_SIMD<uchar, int>
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v = v_pack(v_a1, v_a2);
|
||||
v_pack_store(dst + i, v);
|
||||
@ -791,7 +791,7 @@ struct iPow_SIMD<schar, int>
|
||||
int i = 0;
|
||||
v_int32 v_1 = vx_setall_s32(1);
|
||||
|
||||
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
|
||||
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
|
||||
{
|
||||
v_int32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_int16 v = vx_load_expand(src + i);
|
||||
@ -803,16 +803,16 @@ struct iPow_SIMD<schar, int>
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v = v_pack(v_a1, v_a2);
|
||||
v_pack_store(dst + i, v);
|
||||
@ -831,7 +831,7 @@ struct iPow_SIMD<ushort, int>
|
||||
int i = 0;
|
||||
v_uint32 v_1 = vx_setall_u32(1u);
|
||||
|
||||
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
|
||||
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_uint16 v = vx_load(src + i);
|
||||
@ -843,16 +843,16 @@ struct iPow_SIMD<ushort, int>
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v = v_pack(v_a1, v_a2);
|
||||
v_store(dst + i, v);
|
||||
@ -871,7 +871,7 @@ struct iPow_SIMD<short, int>
|
||||
int i = 0;
|
||||
v_int32 v_1 = vx_setall_s32(1);
|
||||
|
||||
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
|
||||
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
|
||||
{
|
||||
v_int32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_int16 v = vx_load(src + i);
|
||||
@ -883,16 +883,16 @@ struct iPow_SIMD<short, int>
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v = v_pack(v_a1, v_a2);
|
||||
v_store(dst + i, v);
|
||||
@ -911,29 +911,29 @@ struct iPow_SIMD<int, int>
|
||||
int i = 0;
|
||||
v_int32 v_1 = vx_setall_s32(1);
|
||||
|
||||
for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
|
||||
for ( ; i <= len - VTraits<v_int32>::vlanes()*2; i += VTraits<v_int32>::vlanes()*2)
|
||||
{
|
||||
v_int32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
|
||||
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_int32>::vlanes());
|
||||
int p = power;
|
||||
|
||||
while( p > 1 )
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v_store(dst + i, v_a1);
|
||||
v_store(dst + i + v_int32::nlanes, v_a2);
|
||||
v_store(dst + i + VTraits<v_int32>::vlanes(), v_a2);
|
||||
}
|
||||
vx_cleanup();
|
||||
|
||||
@ -949,34 +949,34 @@ struct iPow_SIMD<float, float>
|
||||
int i = 0;
|
||||
v_float32 v_1 = vx_setall_f32(1.f);
|
||||
|
||||
for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
|
||||
for ( ; i <= len - VTraits<v_float32>::vlanes()*2; i += VTraits<v_float32>::vlanes()*2)
|
||||
{
|
||||
v_float32 v_a1 = v_1, v_a2 = v_1;
|
||||
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
|
||||
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float32>::vlanes());
|
||||
int p = std::abs(power);
|
||||
if( power < 0 )
|
||||
{
|
||||
v_b1 = v_1 / v_b1;
|
||||
v_b2 = v_1 / v_b2;
|
||||
v_b1 = v_div(v_1, v_b1);
|
||||
v_b2 = v_div(v_1, v_b2);
|
||||
}
|
||||
|
||||
while( p > 1 )
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v_store(dst + i, v_a1);
|
||||
v_store(dst + i + v_float32::nlanes, v_a2);
|
||||
v_store(dst + i + VTraits<v_float32>::vlanes(), v_a2);
|
||||
}
|
||||
vx_cleanup();
|
||||
|
||||
@ -984,7 +984,7 @@ struct iPow_SIMD<float, float>
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
template <>
|
||||
struct iPow_SIMD<double, double>
|
||||
{
|
||||
@ -993,34 +993,34 @@ struct iPow_SIMD<double, double>
|
||||
int i = 0;
|
||||
v_float64 v_1 = vx_setall_f64(1.);
|
||||
|
||||
for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
|
||||
for ( ; i <= len - VTraits<v_float64>::vlanes()*2; i += VTraits<v_float64>::vlanes()*2)
|
||||
{
|
||||
v_float64 v_a1 = v_1, v_a2 = v_1;
|
||||
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
|
||||
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float64>::vlanes());
|
||||
int p = std::abs(power);
|
||||
if( power < 0 )
|
||||
{
|
||||
v_b1 = v_1 / v_b1;
|
||||
v_b2 = v_1 / v_b2;
|
||||
v_b1 = v_div(v_1, v_b1);
|
||||
v_b2 = v_div(v_1, v_b2);
|
||||
}
|
||||
|
||||
while( p > 1 )
|
||||
{
|
||||
if (p & 1)
|
||||
{
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
}
|
||||
v_b1 *= v_b1;
|
||||
v_b2 *= v_b2;
|
||||
v_b1 = v_mul(v_b1, v_b1);
|
||||
v_b2 = v_mul(v_b2, v_b2);
|
||||
p >>= 1;
|
||||
}
|
||||
|
||||
v_a1 *= v_b1;
|
||||
v_a2 *= v_b2;
|
||||
v_a1 = v_mul(v_a1, v_b1);
|
||||
v_a2 = v_mul(v_a2, v_b2);
|
||||
|
||||
v_store(dst + i, v_a1);
|
||||
v_store(dst + i + v_float64::nlanes, v_a2);
|
||||
v_store(dst + i + VTraits<v_float64>::vlanes(), v_a2);
|
||||
}
|
||||
vx_cleanup();
|
||||
|
||||
@ -1614,7 +1614,7 @@ void patchNaNs( InputOutputArray _a, double _val )
Cv32suf val;
val.f = (float)_val;

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
v_int32 v_val = vx_setall_s32(val.i);
#endif
@ -1624,12 +1624,12 @@ void patchNaNs( InputOutputArray _a, double _val )
int* tptr = ptrs[0];
size_t j = 0;

#if CV_SIMD
size_t cWidth = (size_t)v_int32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
size_t cWidth = (size_t)VTraits<v_int32>::vlanes();
for ( ; j + cWidth <= len; j += cWidth)
{
v_int32 v_src = vx_load(tptr + j);
v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1));
v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
v_store(tptr + j, v_dst);
}
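patchNaNs gets the same rewrite for comparisons and bitwise operations: `<`, `&` and selection on integer vectors become v_lt, v_and and v_select, while the NaN test itself, (bits & 0x7fffffff) > 0x7f800000, is unchanged. A sketch of the rewritten inner loop over one row; the row pointer, length and replacement bit pattern are assumed names, not from the patch.

// Sketch only: replace NaN lanes of a float row (viewed as int bits) with
// replacement_bits.
#include "opencv2/core/hal/intrin.hpp"

static void patch_nans_row(int* tptr, size_t len, int replacement_bits)
{
    size_t j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    cv::v_int32 v_mask1 = cv::vx_setall_s32(0x7fffffff);  // clear the sign bit
    cv::v_int32 v_mask2 = cv::vx_setall_s32(0x7f800000);  // exponent-all-ones threshold
    cv::v_int32 v_val   = cv::vx_setall_s32(replacement_bits);
    size_t cWidth = (size_t)cv::VTraits<cv::v_int32>::vlanes();
    for (; j + cWidth <= len; j += cWidth)
    {
        cv::v_int32 v_src = cv::vx_load(tptr + j);
        cv::v_int32 v_nan = cv::v_lt(v_mask2, cv::v_and(v_src, v_mask1));  // true for NaN lanes
        cv::v_store(tptr + j, cv::v_select(v_nan, v_val, v_src));
    }
#endif
    for (; j < len; j++)                                   // scalar tail
        if ((tptr[j] & 0x7fffffff) > 0x7f800000)
            tptr[j] = replacement_bits;
}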
@ -1454,7 +1454,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
|
||||
static void
|
||||
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
const int BITS = 10, SCALE = 1 << BITS;
|
||||
const float MAX_M = (float)(1 << (15 - BITS));
|
||||
|
||||
@ -1485,7 +1485,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
|
||||
v_int32 m10 = vx_setall_s32(m32[4]);
|
||||
v_int32 m11 = vx_setall_s32(m32[5]);
|
||||
int x = 0;
|
||||
for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
|
||||
for (; x <= (len - VTraits<v_uint8>::vlanes()) * nChannels; x += VTraits<v_uint8>::vlanes() * nChannels)
|
||||
{
|
||||
v_uint8 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
@ -1499,20 +1499,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
|
||||
v_int32 p1, p3;
|
||||
v_expand(bgl, p0, p2);
|
||||
v_expand(v_reinterpret_as_s16(rl), p1, p3);
|
||||
dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
|
||||
dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
|
||||
drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
|
||||
dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
|
||||
dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
|
||||
drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
|
||||
v_expand(bgh, p0, p2);
|
||||
v_expand(v_reinterpret_as_s16(rh), p1, p3);
|
||||
dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
|
||||
dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
|
||||
drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
|
||||
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
|
||||
dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
|
||||
dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
|
||||
drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
|
||||
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
|
||||
v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
|
||||
}
|
||||
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
|
||||
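The transform_8u hunks above keep the 10-bit fixed-point formulation and only change the intrinsic spelling. As a reminder of what that arithmetic does, here is a scalar model of one output channel; the coefficient layout m32[0..3] is illustrative and does not claim to match the packing used by the patched code:

#include <algorithm>
#include <cstdint>

enum { BITS = 10, SCALE = 1 << BITS };

static inline uint8_t transformOneChannel(const uint8_t bgr[3], const int m32[4])
{
    // m32[0..2] ~ round(m * SCALE); m32[3] ~ bias pre-scaled by SCALE (incl. +0.5 rounding term)
    int t = (m32[0] * bgr[0] + m32[1] * bgr[1] + m32[2] * bgr[2] + m32[3]) >> BITS;
    return (uint8_t)std::min(std::max(t, 0), 255);   // saturate like saturate_cast<uchar>
}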
@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
|
||||
static void
|
||||
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( scn == 3 && dcn == 3 )
|
||||
{
|
||||
int x = 0;
|
||||
@ -1555,7 +1555,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
||||
v_float32 m10 = vx_setall_f32(m[10]);
|
||||
v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
|
||||
v_int16 delta = vx_setall_s16(-32768);
|
||||
for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
|
||||
for (; x <= (len - VTraits<v_uint16>::vlanes())*3; x += VTraits<v_uint16>::vlanes()*3)
|
||||
{
|
||||
v_uint16 b, g, r;
|
||||
v_load_deinterleave(src + x, b, g, r);
|
||||
@ -1574,6 +1574,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
||||
v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
|
||||
}
|
||||
#endif
|
||||
#if CV_SIMD128
|
||||
v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
|
||||
v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
|
||||
v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
|
||||
@ -1587,6 +1588,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
||||
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
|
||||
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)),
|
||||
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
|
||||
#endif //CV_SIMD128
|
||||
for( ; x < len * 3; x += 3 )
|
||||
{
|
||||
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
|
||||
@ -1606,25 +1608,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
||||
static void
|
||||
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
|
||||
{
|
||||
#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
int x = 0;
|
||||
if( scn == 3 && dcn == 3 )
|
||||
{
|
||||
int idx[v_float32::nlanes/2];
|
||||
for( int i = 0; i < v_float32::nlanes/4; i++ )
|
||||
int idx[VTraits<v_float32>::max_nlanes/2];
|
||||
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
|
||||
{
|
||||
idx[i] = 3*i;
|
||||
idx[i + v_float32::nlanes/4] = 0;
|
||||
idx[i + VTraits<v_float32>::vlanes()/4] = 0;
|
||||
}
|
||||
float _m[] = { m[0], m[4], m[ 8], 0.f,
|
||||
m[1], m[5], m[ 9], 0.f,
|
||||
m[2], m[6], m[10], 0.f,
|
||||
m[3], m[7], m[11], 0.f };
|
||||
v_float32 m0 = vx_lut_quads(_m , idx + v_float32::nlanes/4);
|
||||
v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4);
|
||||
v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4);
|
||||
v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
|
||||
for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
|
||||
v_float32 m0 = vx_lut_quads(_m , idx + VTraits<v_float32>::vlanes()/4);
|
||||
v_float32 m1 = vx_lut_quads(_m + 4, idx + VTraits<v_float32>::vlanes()/4);
|
||||
v_float32 m2 = vx_lut_quads(_m + 8, idx + VTraits<v_float32>::vlanes()/4);
|
||||
v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits<v_float32>::vlanes()/4);
|
||||
for( ; x <= len*3 - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4 )
|
||||
v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
|
||||
for( ; x < len*3; x += 3 )
|
||||
{
|
||||
@ -1641,8 +1643,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
|
||||
if( scn == 4 && dcn == 4 )
|
||||
{
|
||||
#if CV_SIMD_WIDTH > 16
|
||||
int idx[v_float32::nlanes/4];
|
||||
for( int i = 0; i < v_float32::nlanes/4; i++ )
|
||||
int idx[VTraits<v_float32>::max_nlanes/4];
|
||||
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
|
||||
idx[i] = 0;
|
||||
float _m[] = { m[4], m[9], m[14], m[19] };
|
||||
v_float32 m0 = vx_lut_quads(m , idx);
|
||||
@ -1650,12 +1652,13 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
|
||||
v_float32 m2 = vx_lut_quads(m+10, idx);
|
||||
v_float32 m3 = vx_lut_quads(m+15, idx);
|
||||
v_float32 m4 = vx_lut_quads(_m, idx);
|
||||
for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
|
||||
for( ; x <= len*4 - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v_src = vx_load(src + x);
|
||||
v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
|
||||
v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4));
|
||||
}
|
||||
#endif
|
||||
#if CV_SIMD128
|
||||
v_float32x4 _m0 = v_load(m );
|
||||
v_float32x4 _m1 = v_load(m + 5);
|
||||
v_float32x4 _m2 = v_load(m + 10);
|
||||
@ -1666,6 +1669,17 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
|
||||
v_float32x4 v_src = v_load(src + x);
|
||||
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
|
||||
}
|
||||
#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
|
||||
for( ; x < len*4; x += 4 )
|
||||
{
|
||||
float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
|
||||
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]);
|
||||
float t1 = saturate_cast<float>(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]);
|
||||
float t2 = saturate_cast<float>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
|
||||
float t3 = saturate_cast<float>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
|
||||
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3;
|
||||
}
|
||||
#endif
|
||||
vx_cleanup();
|
||||
return;
|
||||
}
|
||||
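A recurring detail in the transform_32f hunk above: on scalable targets the lane count is a run-time value, so loop bounds switch to VTraits<T>::vlanes() while stack arrays must be sized with the compile-time bound VTraits<T>::max_nlanes. A minimal sketch of that split, assuming the universal-intrinsics header; the function itself is illustrative:

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
float horizontalSumSketch(const float* src, int len)
{
    using namespace cv;
    v_float32 acc = vx_setzero_f32();
    const int step = VTraits<v_float32>::vlanes();           // run-time lane count
    int i = 0;
    for (; i <= len - step; i += step)
        acc = v_add(acc, vx_load(src + i));

    // vlanes() is not a constant expression on scalable targets, so buffers
    // are sized with the compile-time upper bound max_nlanes instead.
    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) lanes[VTraits<v_float32>::max_nlanes];
    v_store_aligned(lanes, acc);
    float s = 0.f;
    for (int k = 0; k < step; k++)
        s += lanes[k];
    for (; i < len; i++)                                      // scalar tail
        s += src[i];
    return s;
}
#endif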
@ -1936,9 +1950,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
{
float alpha = *_alpha;
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_alpha = vx_setall_f32(alpha);
const int cWidth = v_float32::nlanes;
const int cWidth = VTraits<v_float32>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
vx_cleanup();
@ -1953,9 +1967,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
{
double alpha = *_alpha;
int i = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_float64 a2 = vx_setall_f64(alpha);
const int cWidth = v_float64::nlanes;
const int cWidth = VTraits<v_float64>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
vx_cleanup();
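These two kernels back the public cv::scaleAdd, i.e. dst = alpha*src1 + src2 (a SAXPY/DAXPY). A short usage example for context:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat a = cv::Mat::ones(4, 4, CV_32F);
    cv::Mat b = cv::Mat::ones(4, 4, CV_32F);
    cv::Mat dst;
    cv::scaleAdd(a, 2.0, b, dst);   // every element of dst becomes 2*1 + 1 = 3
    return 0;
}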
@ -2078,7 +2092,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
deltastep = deltastep ? 4 : 0;
|
||||
}
|
||||
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD128_64F
|
||||
v_float64x2 v_scale = v_setall_f64(scale);
|
||||
#endif
|
||||
|
||||
@ -2090,7 +2104,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
|
||||
for( j = i; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD128_64F
|
||||
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
|
||||
{
|
||||
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
|
||||
@ -2150,7 +2164,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
|
||||
for( j = i; j <= size.width - 4; j += 4 )
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD128_64F
|
||||
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
|
||||
{
|
||||
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
|
||||
@ -2227,7 +2241,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
double s = 0;
|
||||
const sT *tsrc1 = src + i*srcstep;
|
||||
const sT *tsrc2 = src + j*srcstep;
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD128_64F
|
||||
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
|
||||
{
|
||||
const double *v_tsrc1 = (double *)(tsrc1);
|
||||
@ -2280,7 +2294,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
|
||||
delta_buf[2] = delta_buf[3] = tdelta2[0];
|
||||
tdelta2 = delta_buf;
|
||||
}
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD128_64F
|
||||
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
|
||||
{
|
||||
const double *v_tsrc2 = (double *)(tsrc2);
|
||||
@ -2393,14 +2407,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
double r = 0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 15), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_uint32 v_sum = vx_setzero_u32();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
const int cWidth = VTraits<v_uint16>::vlanes();
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
@ -2414,7 +2428,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
|
||||
{
|
||||
v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
|
||||
v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
|
||||
v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
|
||||
v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)));
|
||||
}
|
||||
r += (double)v_reduce_sum(v_sum);
|
||||
|
||||
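The dotProd_8u hunk above keeps the block-wise accumulation (blockSize0 = 1 << 15) and only changes the intrinsic calls. A scalar model of why the block size is chosen this way: partial sums live in a 32-bit integer and are flushed to the double result every 2^15 elements, so even the worst case 2^15 * 255 * 255 = 2,130,739,200 stays below INT32_MAX.

#include <algorithm>
#include <cstdint>

double dotProd8uSketch(const uint8_t* a, const uint8_t* b, int len)
{
    double r = 0;
    int i = 0;
    const int blockSize0 = 1 << 15;
    while (i < len)
    {
        int blockSize = std::min(len - i, blockSize0);
        int32_t s = 0;                               // partial sum, flushed once per block
        for (int j = 0; j < blockSize; j++)
            s += (int32_t)a[i + j] * b[i + j];
        r += (double)s;
        i += blockSize;
    }
    return r;
}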
@ -2433,14 +2447,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 14), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int32 v_sum = vx_setzero_s32();
|
||||
const int cWidth = v_int16::nlanes;
|
||||
const int cWidth = VTraits<v_int16>::vlanes();
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
|
||||
@ -2473,14 +2487,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_uint64 v_sum = vx_setzero_u64();
|
||||
const int cWidth = v_uint16::nlanes;
|
||||
const int cWidth = VTraits<v_uint16>::vlanes();
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
@ -2505,14 +2519,14 @@ double dotProd_16s(const short* src1, const short* src2, int len)
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 24), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v_int64 v_sum = vx_setzero_s64();
|
||||
const int cWidth = v_int16::nlanes;
|
||||
const int cWidth = VTraits<v_int16>::vlanes();
|
||||
|
||||
int j = 0;
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
@ -2534,10 +2548,10 @@ double dotProd_16s(const short* src1, const short* src2, int len)
|
||||
|
||||
double dotProd_32s(const int* src1, const int* src2, int len)
|
||||
{
|
||||
#if CV_SIMD_64F
|
||||
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
|
||||
double r = .0;
|
||||
int i = 0;
|
||||
const int step = v_int32::nlanes;
|
||||
const int step = VTraits<v_int32>::vlanes();
|
||||
v_float64 v_sum0 = vx_setzero_f64();
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
const int wstep = step * 2;
|
||||
@ -2572,8 +2586,8 @@ double dotProd_32f(const float* src1, const float* src2, int len)
|
||||
double r = 0.0;
|
||||
int i = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int len0 = len & -VTraits<v_float32>::vlanes(), blockSize0 = (1 << 13), blockSize;
|
||||
|
||||
while (i < len0)
|
||||
{
|
||||
@ -2581,7 +2595,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
|
||||
v_float32 v_sum = vx_setzero_f32();
|
||||
|
||||
int j = 0;
|
||||
int cWidth = v_float32::nlanes;
|
||||
int cWidth = VTraits<v_float32>::vlanes();
|
||||
|
||||
#if CV_ENABLE_UNROLLED
|
||||
v_float32 v_sum1 = vx_setzero_f32();
|
||||
@ -2600,7 +2614,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
|
||||
vx_load(src2 + j + (cWidth * 3)), v_sum3);
|
||||
}
|
||||
|
||||
v_sum += v_sum1 + v_sum2 + v_sum3;
|
||||
v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3));
|
||||
#endif
|
||||
|
||||
for (; j <= blockSize - cWidth; j += cWidth)
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "opencv2/core/detail/dispatch_helper.impl.hpp"
|
||||
|
||||
#include <algorithm> // std::swap_ranges
|
||||
#include <numeric> // std::accumulate
|
||||
|
||||
namespace cv {
|
||||
|
||||
@ -440,7 +441,7 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
|
||||
static void
|
||||
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
|
||||
{
|
||||
#if CV_SIMD
|
||||
#if CV_SIMD128
|
||||
#if CV_STRONG_ALIGNMENT
|
||||
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
|
||||
#endif
|
||||
@ -563,7 +564,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
|
||||
}
|
||||
#endif
|
||||
else
|
||||
#endif // CV_SIMD
|
||||
#endif // CV_SIMD128
|
||||
{
|
||||
int i, j, limit = (int)(((size.width + 1)/2)*esz);
|
||||
AutoBuffer<int> _tab(size.width*esz);
|
||||
@ -596,12 +597,12 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
|
||||
dst0 += dstep, dst1 -= dstep )
|
||||
{
|
||||
int i = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
#if CV_STRONG_ALIGNMENT
|
||||
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
|
||||
#endif
|
||||
{
|
||||
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
|
||||
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i));
|
||||
v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i));
|
||||
@ -612,7 +613,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
|
||||
#if CV_STRONG_ALIGNMENT
|
||||
else
|
||||
{
|
||||
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
|
||||
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint8 t0 = vx_load(src0 + i);
|
||||
v_uint8 t1 = vx_load(src1 + i);
|
||||
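The flipVert hunk above replaces the byte-count bound CV_SIMD_WIDTH with the lane count of v_uint8, so the same loop also works when the register width is only known at run time. A sketch of the row swap it performs, under the same header assumption (function name illustrative):

#include <utility>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
void swapRowsSketch(unsigned char* row0, unsigned char* row1, int width)
{
    using namespace cv;
    int i = 0;
    for (; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
    {
        v_uint8 t0 = vx_load(row0 + i);
        v_uint8 t1 = vx_load(row1 + i);
        v_store(row0 + i, t1);
        v_store(row1 + i, t0);
    }
    for (; i < width; i++)          // scalar tail
        std::swap(row0[i], row1[i]);
}
#endif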
@ -857,6 +858,223 @@ void flipND(InputArray _src, OutputArray _dst, int _axis)
|
||||
flipNDImpl(dst.ptr(), dst.size.p, dst.step.p, axis);
|
||||
}
|
||||
|
||||
/*
|
||||
This function first prepends 1 to each tensor shape to have a common max_ndims dimension, then flatten non-broadcast dimensions.
|
||||
*/
|
||||
static bool _flatten_for_broadcast(int narrays, int max_ndims, const int* ndims, const int** orig_shape,
|
||||
int** flatten_shape, size_t** flatten_step) {
|
||||
int i, j, k;
|
||||
|
||||
// step 1.
|
||||
// * make all inputs and the output max_ndims-dimensional.
|
||||
// * compute proper step's
|
||||
for (i = max_ndims - 1; i >= 0; i-- ) {
|
||||
for (k = 0; k < narrays; k++) {
|
||||
j = ndims[k] - (max_ndims - i);
|
||||
int sz_i = j >= 0 ? orig_shape[k][j] : 1;
|
||||
size_t st_i = i == max_ndims - 1 ? 1 : flatten_step[k][i+1] * flatten_shape[k][i+1];
|
||||
flatten_shape[k][i] = sz_i;
|
||||
flatten_step[k][i] = st_i;
|
||||
if (flatten_shape[k][i] == 0)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// step 2. Let's do the flattening first,
|
||||
// since we'd need proper values of steps to check continuity.
|
||||
// this loop is probably the most tricky part
|
||||
// in the whole implementation of broadcasting.
|
||||
j = max_ndims-1;
|
||||
for (i = j - 1; i >= 0; i--) {
|
||||
bool all_contiguous = true, all_scalars = true, all_consistent = true;
|
||||
for(k = 0; k < narrays; k++) {
|
||||
size_t st = flatten_step[k][j] * flatten_shape[k][j];
|
||||
bool prev_scalar = flatten_shape[k][j] == 1;
|
||||
bool scalar = flatten_shape[k][i] == 1;
|
||||
all_contiguous = all_contiguous && (st == flatten_step[k][i]);
|
||||
all_scalars = all_scalars && scalar;
|
||||
all_consistent = all_consistent && (scalar == prev_scalar);
|
||||
}
|
||||
if (all_contiguous && (all_consistent || all_scalars)) {
|
||||
for(k = 0; k < narrays; k++)
|
||||
flatten_shape[k][j] *= flatten_shape[k][i];
|
||||
} else {
|
||||
j--;
|
||||
if (i < j) {
|
||||
for(k = 0; k < narrays; k++) {
|
||||
flatten_shape[k][j] = flatten_shape[k][i];
|
||||
flatten_step[k][j] = flatten_step[k][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 3. Set some step's to 0's.
|
||||
for (i = max_ndims-1; i >= j; i--) {
|
||||
for (k = 0; k < narrays; k++)
|
||||
flatten_step[k][i] = flatten_shape[k][i] == 1 ? 0 : flatten_step[k][i];
|
||||
}
|
||||
for (; i >= 0; i--) {
|
||||
for (k = 0; k < narrays; k++) {
|
||||
flatten_step[k][i] = 0;
|
||||
flatten_shape[k][i] = 1;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
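_flatten_for_broadcast above normalizes all shapes to a common rank and then merges contiguous axes. The broadcasting rule it ultimately serves (enforced by the CV_CheckEQ in broadcast() below) is the usual one: shapes are aligned at the trailing axes, missing leading axes count as 1, and an axis may differ from the target only if the source axis is 1. A small stand-alone checker expressing just that rule (illustrative, not the patched code):

#include <vector>

// Returns true if `from` can be broadcast to `to`.
bool broadcastable(std::vector<int> from, const std::vector<int>& to)
{
    if (from.size() > to.size())
        return false;
    from.insert(from.begin(), to.size() - from.size(), 1);   // prepend 1s
    for (size_t i = 0; i < to.size(); ++i)
        if (from[i] != 1 && from[i] != to[i])
            return false;
    return true;
}
// e.g. broadcastable({2, 1}, {4, 2, 3}) == true, broadcastable({3}, {4, 2}) == false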
void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
Mat src = _src.getMat();
|
||||
CV_CheckTrue(src.isContinuous(), "broadcast: input array must be contiguous");
|
||||
CV_CheckChannelsEQ(src.channels(), 1, "broadcast: input array must be single channel");
|
||||
|
||||
Mat shape = _shape.getMat();
|
||||
CV_CheckTypeEQ(shape.type(), CV_32S, "broadcast: target shape must be of type int32");
|
||||
const auto dims_shape = static_cast<int>(shape.total());
|
||||
const auto *ptr_shape = shape.ptr<int>();
|
||||
|
||||
// check valid shape, 1D/0D Mat would fail in the following checks
|
||||
const auto dims_src = src.dims;
|
||||
CV_CheckLE(dims_src, dims_shape,
|
||||
"broadcast: dimension of input array must be less than or equal to dimension of target shape");
|
||||
std::vector<int> shape_src{src.size.p, src.size.p + dims_src};
|
||||
if (shape_src.size() < static_cast<size_t>(dims_shape)) {
|
||||
shape_src.insert(shape_src.begin(), dims_shape - shape_src.size(), 1);
|
||||
}
|
||||
for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
|
||||
const auto *shape_target = ptr_shape;
|
||||
if (shape_src[i] != 1) {
|
||||
CV_CheckEQ(shape_src[i], shape_target[i], "target shape must be equal to input shape or 1");
|
||||
}
|
||||
}
|
||||
|
||||
// impl
|
||||
_dst.create(dims_shape, shape.ptr<int>(), src.type());
|
||||
Mat dst = _dst.getMat();
|
||||
std::vector<int> is_same_shape(dims_shape, 0);
|
||||
for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
|
||||
if (shape_src[i] == ptr_shape[i]) {
|
||||
is_same_shape[i] = 1;
|
||||
}
|
||||
}
|
||||
// copy if same shape
|
||||
if (std::accumulate(is_same_shape.begin(), is_same_shape.end(), 1, std::multiplies<int>()) != 0) {
|
||||
const auto *p_src = src.ptr<const char>();
|
||||
auto *p_dst = dst.ptr<char>();
|
||||
std::memcpy(p_dst, p_src, dst.total() * dst.elemSize());
|
||||
return;
|
||||
}
|
||||
// other cases
|
||||
int max_ndims = std::max(dims_src, dims_shape);
|
||||
const int all_ndims[2] = {src.dims, dst.dims};
|
||||
const int* orig_shapes[2] = {src.size.p, dst.size.p};
|
||||
cv::AutoBuffer<size_t> buff(max_ndims * 4);
|
||||
int* flatten_shapes[2] = {(int*)buff.data(), (int*)(buff.data() + max_ndims)};
|
||||
size_t* flatten_steps[2] = {(size_t*)(buff.data() + 2 * max_ndims), (size_t*)(buff.data() + 3 * max_ndims)};
|
||||
if (_flatten_for_broadcast(2, max_ndims, all_ndims, orig_shapes, flatten_shapes, flatten_steps)) {
|
||||
size_t src_dp = flatten_steps[0][max_ndims - 1];
|
||||
size_t dst_dp = flatten_steps[1][max_ndims - 1];
|
||||
CV_Assert(dst_dp == 1);
|
||||
CV_Assert(max_ndims >= 2); // >= 3?
|
||||
size_t rowstep_src = flatten_steps[0][max_ndims - 2];
|
||||
size_t rowstep_dst = flatten_steps[1][max_ndims - 2];
|
||||
const char* ptr_src = src.ptr<const char>();
|
||||
char* ptr_dst = dst.ptr<char>();
|
||||
size_t esz = src.elemSize();
|
||||
int nrows = flatten_shapes[1][max_ndims - 2];
|
||||
int ncols = flatten_shapes[1][max_ndims - 1];
|
||||
int nplanes = 1;
|
||||
CV_Check(esz, esz == 1 || esz == 2 || esz == 4 || esz == 8, "broadcast: not supported data type");
|
||||
|
||||
for (int k = 0; k < max_ndims - 2; k++) {
|
||||
nplanes *= flatten_shapes[1][k];
|
||||
}
|
||||
for (int plane_idx = 0; plane_idx < nplanes; plane_idx++) {
|
||||
size_t offset_src = 0, offset_dst = 0;
|
||||
size_t idx = (size_t)plane_idx;
|
||||
for (int k = max_ndims - 3; k >= 0; k--) {
|
||||
size_t prev_idx = idx / flatten_shapes[1][k];
|
||||
size_t i_k = (int)(idx - prev_idx * flatten_shapes[1][k]);
|
||||
offset_src += i_k * flatten_steps[0][k];
|
||||
offset_dst += i_k * flatten_steps[1][k];
|
||||
idx = prev_idx;
|
||||
}
|
||||
|
||||
#define OPENCV_CORE_BROADCAST_LOOP(_Tp) \
|
||||
for (int i = 0; i < nrows; i++) { \
|
||||
const _Tp *ptr_src_ = (const _Tp*)ptr_src + offset_src + rowstep_src * i; \
|
||||
_Tp *ptr_dst_ = (_Tp*)ptr_dst + offset_dst + rowstep_dst * i; \
|
||||
if (src_dp == 1) { \
|
||||
for (int j = 0; j < ncols; j++) { \
|
||||
ptr_dst_[j] = ptr_src_[j]; \
|
||||
} \
|
||||
} else { \
|
||||
_Tp x = *ptr_src_; \
|
||||
for (int j = 0; j < ncols; j++) { \
|
||||
ptr_dst_[j] = x; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
if (esz == 1) {
|
||||
OPENCV_CORE_BROADCAST_LOOP(int8_t);
|
||||
} else if (esz == 2) {
|
||||
OPENCV_CORE_BROADCAST_LOOP(int16_t);
|
||||
} else if (esz == 4) {
|
||||
OPENCV_CORE_BROADCAST_LOOP(int32_t);
|
||||
} else if (esz == 8) {
|
||||
OPENCV_CORE_BROADCAST_LOOP(int64_t);
|
||||
} else {
|
||||
CV_Error(cv::Error::StsNotImplemented, "");
|
||||
}
|
||||
#undef OPENCV_CORE_BROADCAST_LOOP
|
||||
}
|
||||
} else {
|
||||
// initial copy (src to dst)
|
||||
std::vector<size_t> step_src{src.step.p, src.step.p + dims_src};
|
||||
if (step_src.size() < static_cast<size_t>(dims_shape)) {
|
||||
step_src.insert(step_src.begin(), dims_shape - step_src.size(), step_src[0]);
|
||||
}
|
||||
for (size_t i = 0; i < src.total(); ++i) {
|
||||
size_t t = i;
|
||||
size_t src_offset = 0, dst_offset = 0;
|
||||
for (int j = static_cast<int>(shape_src.size() - 1); j >= 0; --j) {
|
||||
size_t idx = t / shape_src[j];
|
||||
size_t offset = static_cast<size_t>(t - idx * shape_src[j]);
|
||||
src_offset += offset * step_src[j];
|
||||
dst_offset += offset * dst.step[j];
|
||||
t = idx;
|
||||
}
|
||||
const auto *p_src = src.ptr<const char>();
|
||||
auto *p_dst = dst.ptr<char>();
|
||||
std::memcpy(p_dst + dst_offset, p_src + src_offset, dst.elemSize());
|
||||
}
|
||||
// broadcast copy (dst inplace)
|
||||
std::vector<int> cumulative_shape(dims_shape, 1);
|
||||
int total = static_cast<int>(dst.total());
|
||||
for (int i = dims_shape - 1; i >= 0; --i) {
|
||||
cumulative_shape[i] = static_cast<int>(total / ptr_shape[i]);
|
||||
total = cumulative_shape[i];
|
||||
}
|
||||
for (int i = dims_shape - 1; i >= 0; --i) {
|
||||
if (is_same_shape[i] == 1) {
|
||||
continue;
|
||||
}
|
||||
auto step = dst.step[i];
|
||||
auto *p_dst = dst.ptr<char>();
|
||||
for (int j = 0; j < cumulative_shape[i]; j++) {
|
||||
for (int k = 0; k < ptr_shape[i] - 1; k++) {
|
||||
std::memcpy(p_dst + step, p_dst, step);
|
||||
p_dst += step;
|
||||
}
|
||||
p_dst += step;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
|
||||
{
|
||||
CV_Assert(_src.dims() <= 2);
|
||||
|
@ -8,20 +8,24 @@
|
||||
#include "opencv2/core/openvx/ovx_defs.hpp"
|
||||
#include "stat.hpp"
|
||||
|
||||
#ifndef OPENCV_IPP_MEAN
|
||||
#undef HAVE_IPP
|
||||
#undef CV_IPP_RUN_FAST
|
||||
#define CV_IPP_RUN_FAST(f, ...)
|
||||
#undef CV_IPP_RUN
|
||||
#define CV_IPP_RUN(c, f, ...)
|
||||
#endif // OPENCV_IPP_MEAN
|
||||
|
||||
#include "mean.simd.hpp"
|
||||
#include "mean.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
|
||||
|
||||
#ifndef OPENCV_IPP_MEAN
|
||||
#undef HAVE_IPP
|
||||
#undef CV_IPP_RUN_FAST
|
||||
#define CV_IPP_RUN_FAST(f, ...)
|
||||
#undef CV_IPP_RUN
|
||||
#define CV_IPP_RUN(c, f, ...)
|
||||
#endif // OPENCV_IPP_MEAN
|
||||
|
||||
namespace cv {
|
||||
|
||||
|
@ -121,6 +121,7 @@ void merge(const Mat* mv, size_t n, OutputArray _dst)
|
||||
CV_INSTRUMENT_REGION();
|
||||
|
||||
CV_Assert( mv && n > 0 );
|
||||
CV_Assert(!mv[0].empty());
|
||||
|
||||
int depth = mv[0].depth();
|
||||
bool allch1 = true;
|
||||
|
@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
/*
|
||||
The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
|
||||
on IA there are instructions movntps and such to which
|
||||
@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
|
||||
template<typename T, typename VecT> static void
|
||||
vecmerge_( const T** src, T* dst, int len, int cn )
|
||||
{
|
||||
const int VECSZ = VecT::nlanes;
|
||||
const int VECSZ = VTraits<VecT>::vlanes();
|
||||
int i, i0 = 0;
|
||||
const T* src0 = src[0];
|
||||
const T* src1 = src[1];
|
||||
@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn )
|
||||
void merge8u(const uchar** src, uchar* dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn )
|
||||
void merge16u(const ushort** src, ushort* dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -195,8 +195,8 @@ void merge16u(const ushort** src, ushort* dst, int len, int cn )
|
||||
void merge32s(const int** src, int* dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_int32>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecmerge_<int, v_int32>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn )
|
||||
void merge64s(const int64** src, int64* dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecmerge_<int64, v_int64>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
|
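The merge hunks above only touch the SIMD gate and the lane count; the underlying idea is plane-to-interleaved copying with v_store_interleave. A sketch of the 3-channel case, assuming the intrinsics header (vecmerge_ additionally handles an unaligned head, omitted here):

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
void merge3Sketch(const unsigned char* b, const unsigned char* g, const unsigned char* r,
                  unsigned char* dst, int len)
{
    using namespace cv;
    const int VECSZ = VTraits<v_uint8>::vlanes();
    int i = 0;
    for (; i <= len - VECSZ; i += VECSZ)
        v_store_interleave(dst + i*3, vx_load(b + i), vx_load(g + i), vx_load(r + i));
    for (; i < len; i++)                          // scalar tail
    {
        dst[i*3] = b[i]; dst[i*3 + 1] = g[i]; dst[i*3 + 2] = r[i];
    }
}
#endif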
@ -11,11 +11,13 @@
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#ifndef OPENCV_IPP_MINMAX
|
||||
#undef HAVE_IPP
|
||||
#undef CV_IPP_RUN_FAST
|
||||
#define CV_IPP_RUN_FAST(f, ...)
|
||||
#undef CV_IPP_RUN
|
||||
#define CV_IPP_RUN(c, f, ...)
|
||||
#endif // OPENCV_IPP_MINMAX
|
||||
|
||||
#define IPP_DISABLE_MINMAXIDX_MANY_ROWS 1 // see Core_MinMaxIdx.rows_overflow test
|
||||
|
||||
|
@ -63,25 +63,25 @@ int normHamming(const uchar* a, int n, int cellSize)
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_uint64 t = vx_setzero_u64();
|
||||
if ( cellSize == 2)
|
||||
{
|
||||
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
|
||||
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
|
||||
t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
|
||||
}
|
||||
}
|
||||
else // cellSize == 4
|
||||
{
|
||||
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
|
||||
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
|
||||
v_uint16 a1 = a0 | (a0 >> 2);
|
||||
t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
|
||||
v_uint16 a1 = v_or(a0, v_shr<2>(a0));
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));
|
||||
|
||||
}
|
||||
}
|
||||
@ -109,25 +109,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_uint64 t = vx_setzero_u64();
|
||||
if ( cellSize == 2)
|
||||
{
|
||||
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
|
||||
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
|
||||
t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
|
||||
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask))));
|
||||
}
|
||||
}
|
||||
else // cellSize == 4
|
||||
{
|
||||
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
|
||||
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
|
||||
v_uint16 ab1 = ab0 | (ab0 >> 2);
|
||||
t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
|
||||
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
|
||||
v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0));
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask))));
|
||||
}
|
||||
}
|
||||
result += (int)v_reduce_sum(t);
|
||||
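In the cellSize paths above, each 2- or 4-bit cell of a binary descriptor should contribute at most 1 to the distance. OR-ing the XOR of the two descriptors with itself shifted right and masking leaves one marker bit per cell, which popcount then counts. A scalar model of the cellSize == 2, two-array case:

#include <bitset>
#include <cstdint>

// Two descriptor bits per cell; a cell contributes 1 if either of its bits differs.
int hammingCells2(uint32_t a, uint32_t b)
{
    uint32_t x = a ^ b;
    x = (x | (x >> 1)) & 0x55555555u;     // one marker bit per 2-bit cell
    return (int)std::bitset<32>(x).count();
}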
@ -145,21 +145,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
|
||||
float normL2Sqr_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
|
||||
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
|
||||
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
|
||||
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 t0 = vx_load(a + j) - vx_load(b + j);
|
||||
v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
|
||||
v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j));
|
||||
v_float32 t1 = v_sub(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes()));
|
||||
v_d0 = v_muladd(t0, t0, v_d0);
|
||||
v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
|
||||
v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes()));
|
||||
v_d1 = v_muladd(t1, t1, v_d1);
|
||||
v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
|
||||
v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes()));
|
||||
v_d2 = v_muladd(t2, t2, v_d2);
|
||||
v_d3 = v_muladd(t3, t3, v_d3);
|
||||
}
|
||||
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
|
||||
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
|
||||
#endif
|
||||
for( ; j < n; j++ )
|
||||
{
|
||||
@ -173,17 +173,17 @@ float normL2Sqr_(const float* a, const float* b, int n)
|
||||
float normL1_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
|
||||
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
|
||||
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
|
||||
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
|
||||
v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
|
||||
v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
|
||||
v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
|
||||
v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j)));
|
||||
v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes())));
|
||||
v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes())));
|
||||
v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes())));
|
||||
}
|
||||
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
|
||||
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
|
||||
#endif
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
@ -193,12 +193,12 @@ float normL1_(const float* a, const float* b, int n)
|
||||
int normL1_(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int j = 0, d = 0;
|
||||
#if CV_SIMD
|
||||
for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes())
|
||||
d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
|
||||
v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
|
||||
v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
|
||||
v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
|
||||
v_reduce_sad(vx_load(a + j + VTraits<v_uint8>::vlanes()), vx_load(b + j + VTraits<v_uint8>::vlanes())) +
|
||||
v_reduce_sad(vx_load(a + j + 2 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 2 * VTraits<v_uint8>::vlanes())) +
|
||||
v_reduce_sad(vx_load(a + j + 3 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 3 * VTraits<v_uint8>::vlanes()));
|
||||
#endif
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
|
@ -322,16 +322,20 @@ int decodeSimpleFormat( const char* dt )

}

#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64)
#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 1
#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64) || \
(defined (__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__)
#define CV_LITTLE_ENDIAN_MEM_ACCESS 1
#else
#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 0
#define CV_LITTLE_ENDIAN_MEM_ACCESS 0
#endif

static inline int readInt(const uchar* p)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
return *(const int*)p;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
int val;
memcpy(&val, p, sizeof(val));
return val;
#else
int val = (int)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
return val;
@ -340,8 +344,11 @@ static inline int readInt(const uchar* p)

static inline double readReal(const uchar* p)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
return *(const double*)p;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
double val;
memcpy(&val, p, sizeof(val));
return val;
#else
unsigned val0 = (unsigned)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
unsigned val1 = (unsigned)(p[4] | (p[5] << 8) | (p[6] << 16) | (p[7] << 24));
@ -353,9 +360,9 @@ static inline double readReal(const uchar* p)
|
||||
|
||||
static inline void writeInt(uchar* p, int ival)
|
||||
{
|
||||
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
|
||||
int* ip = (int*)p;
|
||||
*ip = ival;
|
||||
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
|
||||
#if CV_LITTLE_ENDIAN_MEM_ACCESS
|
||||
memcpy(p, &ival, sizeof(ival));
|
||||
#else
|
||||
p[0] = (uchar)ival;
|
||||
p[1] = (uchar)(ival >> 8);
|
||||
@ -366,9 +373,9 @@ static inline void writeInt(uchar* p, int ival)
|
||||
|
||||
static inline void writeReal(uchar* p, double fval)
|
||||
{
|
||||
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
|
||||
double* fp = (double*)p;
|
||||
*fp = fval;
|
||||
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
|
||||
#if CV_LITTLE_ENDIAN_MEM_ACCESS
|
||||
memcpy(p, &fval, sizeof(fval));
|
||||
#else
|
||||
Cv64suf v;
|
||||
v.f = fval;
|
||||
|
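The persistence hunks above drop the unaligned pointer casts in favour of memcpy: the cast form is undefined behaviour on unaligned addresses and gives wrong values on big-endian machines, while memcpy compiles to a single load on little-endian targets. A sketch of the adopted pattern; if CV_LITTLE_ENDIAN_MEM_ACCESS (from the hunk above) is not defined, the portable byte-wise branch is taken, which is correct on any endianness:

#include <cstdint>
#include <cstring>

static inline int32_t readLE32(const unsigned char* p)
{
#if CV_LITTLE_ENDIAN_MEM_ACCESS                       // macro from the hunk above
    int32_t v;
    std::memcpy(&v, p, sizeof(v));                    // folded into one load by the optimizer
    return v;
#else                                                 // byte-wise assembly: any endianness
    return (int32_t)(p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24));
#endif
}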
@ -308,8 +308,8 @@ public:

if( !multiline )
{
ptr = fs->resizeWriteBuffer( ptr, len + 9 );
sprintf( ptr, "<!-- %s -->", comment );
ptr = fs->resizeWriteBuffer( ptr, len + 5+4+1 );
snprintf( ptr, len + 5+4+1, "<!-- %s -->", comment );
len = (int)strlen(ptr);
}
else
@ -344,7 +344,7 @@ public:
fs->setBufferPtr(ptr);
ptr = fs->flush();
}
sprintf( ptr, "-->" );
strcpy( ptr, "-->" );
fs->setBufferPtr(ptr + 3);
fs->flush();
}
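The snprintf change above also tightens the buffer sizing: "<!-- " is 5 bytes, " -->" is 4, plus the terminating NUL, hence len + 5+4+1 instead of the old len + 9, which left no room for the terminator. A minimal sketch of the same call (names illustrative):

#include <cstdio>

// Needs strlen(comment) + 5 + 4 + 1 bytes in `buf`.
void writeXmlCommentSketch(char* buf, size_t bufSize, const char* comment)
{
    snprintf(buf, bufSize, "<!-- %s -->", comment);   // truncates instead of overflowing
}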
@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn);
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
// see the comments for vecmerge_ in merge.cpp
|
||||
template<typename T, typename VecT> static void
|
||||
vecsplit_( const T* src, T** dst, int len, int cn )
|
||||
{
|
||||
const int VECSZ = VecT::nlanes;
|
||||
const int VECSZ = VTraits<VecT>::vlanes();
|
||||
int i, i0 = 0;
|
||||
T* dst0 = dst[0];
|
||||
T* dst1 = dst[1];
|
||||
@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn )
|
||||
void split8u(const uchar* src, uchar** dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn )
|
||||
void split16u(const ushort* src, ushort** dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn )
|
||||
void split32s(const int* src, int** dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_uint32>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecsplit_<int, v_int32>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn )
|
||||
void split64s(const int64* src, int64** dst, int len, int cn )
|
||||
{
|
||||
CV_INSTRUMENT_REGION();
|
||||
#if CV_SIMD
|
||||
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
|
||||
vecsplit_<int64, v_int64>(src, dst, len, cn);
|
||||
else
|
||||
#endif
|
||||
|
@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n)
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
|
||||
#if CV_SIMD && CV_SIMD_WIDTH > 16
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
{
|
||||
v_uint64 t = vx_setzero_u64();
|
||||
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
|
||||
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
|
||||
result = (int)v_reduce_sum(t);
|
||||
vx_cleanup();
|
||||
}
|
||||
@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n)
|
||||
result += CV_POPCNT_U32(*(uint*)(a + i));
|
||||
}
|
||||
}
|
||||
#elif CV_SIMD
|
||||
{
|
||||
v_uint64x2 t = v_setzero_u64();
|
||||
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
|
||||
t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
|
||||
result += (int)v_reduce_sum(t);
|
||||
}
|
||||
#endif
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= n - 4; i += 4)
|
||||
@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n)
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
|
||||
#if CV_SIMD && CV_SIMD_WIDTH > 16
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
{
|
||||
v_uint64 t = vx_setzero_u64();
|
||||
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
|
||||
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
|
||||
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
|
||||
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
|
||||
result += (int)v_reduce_sum(t);
|
||||
}
|
||||
#endif
|
||||
@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n)
|
||||
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
|
||||
}
|
||||
}
|
||||
#elif CV_SIMD
|
||||
{
|
||||
v_uint64x2 t = v_setzero_u64();
|
||||
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
|
||||
t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
|
||||
result += (int)v_reduce_sum(t);
|
||||
}
|
||||
#endif
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= n - 4; i += 4)
|
||||
|
@ -10,11 +10,13 @@
|
||||
#include "sum.simd.hpp"
|
||||
#include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
|
||||
|
||||
#ifndef OPENCV_IPP_SUM
|
||||
#undef HAVE_IPP
|
||||
#undef CV_IPP_RUN_FAST
|
||||
#define CV_IPP_RUN_FAST(f, ...)
|
||||
#undef CV_IPP_RUN
|
||||
#define CV_IPP_RUN(c, f, ...)
|
||||
#endif // OPENCV_IPP_SUM
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
@ -22,7 +22,7 @@ struct Sum_SIMD
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
|
||||
template <>
|
||||
struct Sum_SIMD<uchar, int>
|
||||
@ -36,41 +36,41 @@ struct Sum_SIMD<uchar, int>
|
||||
int x = 0;
|
||||
v_uint32 v_sum = vx_setzero_u32();
|
||||
|
||||
int len0 = len & -v_uint8::nlanes;
|
||||
int len0 = len & -VTraits<v_uint8>::vlanes();
|
||||
while (x < len0)
|
||||
{
|
||||
const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
|
||||
const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
|
||||
v_uint16 v_sum16 = vx_setzero_u16();
|
||||
for (; x < len_tmp; x += v_uint8::nlanes)
|
||||
for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
|
||||
{
|
||||
v_uint16 v_src0, v_src1;
|
||||
v_expand(vx_load(src0 + x), v_src0, v_src1);
|
||||
v_sum16 += v_src0 + v_src1;
|
||||
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
|
||||
}
|
||||
v_uint32 v_half0, v_half1;
|
||||
v_expand(v_sum16, v_half0, v_half1);
|
||||
v_sum += v_half0 + v_half1;
|
||||
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
|
||||
}
|
||||
if (x <= len - v_uint16::nlanes)
|
||||
if (x <= len - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint32 v_half0, v_half1;
|
||||
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
|
||||
v_sum += v_half0 + v_half1;
|
||||
x += v_uint16::nlanes;
|
||||
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
|
||||
x += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
if (x <= len - v_uint32::nlanes)
|
||||
if (x <= len - VTraits<v_uint32>::vlanes())
|
||||
{
|
||||
v_sum += vx_load_expand_q(src0 + x);
|
||||
x += v_uint32::nlanes;
|
||||
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
|
||||
x += VTraits<v_uint32>::vlanes();
|
||||
}
|
||||
|
||||
if (cn == 1)
|
||||
*dst += v_reduce_sum(v_sum);
|
||||
else
|
||||
{
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum);
|
||||
for (int i = 0; i < v_uint32::nlanes; ++i)
|
||||
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
}
|
||||
v_cleanup();
|
||||
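Sum_SIMD<uchar, int> above accumulates bytes into 16-bit lanes first, but only for at most 256 bytes per lane (256 * 255 = 65280 < 65536) before widening to 32 bits, which is what the 256*vlanes() window bounds. A compilable sketch of the same scheme under the usual header assumption (function name illustrative):

#include <algorithm>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
int sumBytesSketch(const unsigned char* src, int len)
{
    using namespace cv;
    v_uint32 v_sum = vx_setzero_u32();
    int x = 0;
    int len0 = len & -VTraits<v_uint8>::vlanes();
    while (x < len0)
    {
        // At most 256 bytes land in each 16-bit lane before widening.
        const int len_tmp = std::min(x + 256 * VTraits<v_uint16>::vlanes(), len0);
        v_uint16 v_sum16 = vx_setzero_u16();
        for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
        {
            v_uint16 lo, hi;
            v_expand(vx_load(src + x), lo, hi);
            v_sum16 = v_add(v_sum16, v_add(lo, hi));
        }
        v_uint32 lo32, hi32;
        v_expand(v_sum16, lo32, hi32);
        v_sum = v_add(v_sum, v_add(lo32, hi32));
    }
    int s = (int)v_reduce_sum(v_sum);
    for (; x < len; x++)                              // scalar tail
        s += src[x];
    return s;
}
#endif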
@ -91,41 +91,41 @@ struct Sum_SIMD<schar, int>
|
||||
int x = 0;
|
||||
v_int32 v_sum = vx_setzero_s32();
|
||||
|
||||
int len0 = len & -v_int8::nlanes;
|
||||
int len0 = len & -VTraits<v_int8>::vlanes();
|
||||
while (x < len0)
|
||||
{
|
||||
const int len_tmp = min(x + 256*v_int16::nlanes, len0);
|
||||
const int len_tmp = min(x + 256*VTraits<v_int16>::vlanes(), len0);
|
||||
v_int16 v_sum16 = vx_setzero_s16();
|
||||
for (; x < len_tmp; x += v_int8::nlanes)
|
||||
for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
|
||||
{
|
||||
v_int16 v_src0, v_src1;
|
||||
v_expand(vx_load(src0 + x), v_src0, v_src1);
|
||||
v_sum16 += v_src0 + v_src1;
|
||||
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
|
||||
}
|
||||
v_int32 v_half0, v_half1;
|
||||
v_expand(v_sum16, v_half0, v_half1);
|
||||
v_sum += v_half0 + v_half1;
|
||||
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
|
||||
}
|
||||
if (x <= len - v_int16::nlanes)
|
||||
if (x <= len - VTraits<v_int16>::vlanes())
|
||||
{
|
||||
v_int32 v_half0, v_half1;
|
||||
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
|
||||
v_sum += v_half0 + v_half1;
|
||||
x += v_int16::nlanes;
|
||||
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
|
||||
x += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
if (x <= len - v_int32::nlanes)
|
||||
if (x <= len - VTraits<v_int32>::vlanes())
|
||||
{
|
||||
v_sum += vx_load_expand_q(src0 + x);
|
||||
x += v_int32::nlanes;
|
||||
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
|
||||
x += VTraits<v_int32>::vlanes();
|
||||
}
|
||||
|
||||
if (cn == 1)
|
||||
*dst += v_reduce_sum(v_sum);
|
||||
else
|
||||
{
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum);
|
||||
for (int i = 0; i < v_int32::nlanes; ++i)
|
||||
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
}
|
||||
v_cleanup();
|
||||
@ -146,25 +146,25 @@ struct Sum_SIMD<ushort, int>
|
||||
int x = 0;
|
||||
v_uint32 v_sum = vx_setzero_u32();
|
||||
|
||||
for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
|
||||
for (; x <= len - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint32 v_src0, v_src1;
|
||||
v_expand(vx_load(src0 + x), v_src0, v_src1);
|
||||
v_sum += v_src0 + v_src1;
|
||||
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
|
||||
}
|
||||
if (x <= len - v_uint32::nlanes)
|
||||
if (x <= len - VTraits<v_uint32>::vlanes())
|
||||
{
|
||||
v_sum += vx_load_expand(src0 + x);
|
||||
x += v_uint32::nlanes;
|
||||
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
|
||||
x += VTraits<v_uint32>::vlanes();
|
||||
}
|
||||
|
||||
if (cn == 1)
|
||||
*dst += v_reduce_sum(v_sum);
|
||||
else
|
||||
{
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
|
||||
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum);
|
||||
for (int i = 0; i < v_uint32::nlanes; ++i)
|
||||
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
}
|
||||
v_cleanup();
|
||||
@ -185,25 +185,25 @@ struct Sum_SIMD<short, int>
|
||||
int x = 0;
|
||||
v_int32 v_sum = vx_setzero_s32();
|
||||
|
||||
for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
|
||||
for (; x <= len - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
|
||||
{
|
||||
v_int32 v_src0, v_src1;
|
||||
v_expand(vx_load(src0 + x), v_src0, v_src1);
|
||||
v_sum += v_src0 + v_src1;
|
||||
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
|
||||
}
|
||||
if (x <= len - v_int32::nlanes)
|
||||
if (x <= len - VTraits<v_int32>::vlanes())
|
||||
{
|
||||
v_sum += vx_load_expand(src0 + x);
|
||||
x += v_int32::nlanes;
|
||||
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
|
||||
x += VTraits<v_int32>::vlanes();
|
||||
}
|
||||
|
||||
if (cn == 1)
|
||||
*dst += v_reduce_sum(v_sum);
|
||||
else
|
||||
{
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
|
||||
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum);
|
||||
for (int i = 0; i < v_int32::nlanes; ++i)
|
||||
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
}
|
||||
v_cleanup();
|
||||
@ -212,7 +212,7 @@ struct Sum_SIMD<short, int>
|
||||
}
|
||||
};
|
||||
|
||||
#if CV_SIMD_64F
|
||||
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
|
||||
template <>
|
||||
struct Sum_SIMD<int, double>
|
||||
{
|
||||
@ -226,24 +226,24 @@ struct Sum_SIMD<int, double>
|
||||
v_float64 v_sum0 = vx_setzero_f64();
|
||||
v_float64 v_sum1 = vx_setzero_f64();
|
||||
|
||||
for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
|
||||
for (; x <= len - 2 * VTraits<v_int32>::vlanes(); x += 2 * VTraits<v_int32>::vlanes())
|
||||
{
|
||||
v_int32 v_src0 = vx_load(src0 + x);
|
||||
v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
|
||||
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
|
||||
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
|
||||
v_int32 v_src1 = vx_load(src0 + x + VTraits<v_int32>::vlanes());
|
||||
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
|
||||
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
|
||||
}
|
||||
|
||||
#if CV_SIMD256 || CV_SIMD512
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
|
||||
v_store_aligned(ar, v_sum0 + v_sum1);
|
||||
for (int i = 0; i < v_float64::nlanes; ++i)
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
|
||||
v_store_aligned(ar, v_add(v_sum0, v_sum1));
|
||||
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
#else
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum0);
|
||||
v_store_aligned(ar + v_float64::nlanes, v_sum1);
|
||||
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
|
||||
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
|
||||
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
#endif
|
||||
v_cleanup();
|
||||
@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
|
||||
v_float64 v_sum0 = vx_setzero_f64();
|
||||
v_float64 v_sum1 = vx_setzero_f64();
|
||||
|
||||
for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
|
||||
for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 v_src0 = vx_load(src0 + x);
|
||||
v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
|
||||
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
|
||||
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
|
||||
v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
|
||||
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
|
||||
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
|
||||
}
|
||||
|
||||
#if CV_SIMD256 || CV_SIMD512
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
|
||||
v_store_aligned(ar, v_sum0 + v_sum1);
|
||||
for (int i = 0; i < v_float64::nlanes; ++i)
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
|
||||
v_store_aligned(ar, v_add(v_sum0, v_sum1));
|
||||
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
#else
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
|
||||
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
|
||||
v_store_aligned(ar, v_sum0);
|
||||
v_store_aligned(ar + v_float64::nlanes, v_sum1);
|
||||
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
|
||||
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
|
||||
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
|
||||
dst[i % cn] += ar[i];
|
||||
#endif
|
||||
v_cleanup();
|
||||
|
@ -34,7 +34,7 @@
|
||||
#include <errno.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h>
|
||||
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
|
||||
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
@ -343,7 +343,7 @@ private:
|
||||
Impl& operator=(const Impl&); // disabled
|
||||
};
|
||||
|
||||
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
|
||||
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
|
||||
|
||||
struct FileLock::Impl
|
||||
{
|
||||
@ -457,7 +457,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
|
||||
default_cache_path = "/tmp/";
|
||||
CV_LOG_WARNING(NULL, "Using world accessible cache directory. This may be not secure: " << default_cache_path);
|
||||
}
|
||||
#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__
|
||||
#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
|
||||
// https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
|
||||
if (default_cache_path.empty())
|
||||
{
|
||||
|
@ -2292,6 +2292,139 @@ INSTANTIATE_TEST_CASE_P(Arithm, FlipND, testing::Combine(
testing::Values(perf::MatType(CV_8UC1), CV_32FC1)
));

TEST(BroadcastTo, basic) {
std::vector<int> shape_src{2, 1};
std::vector<int> data_src{1, 2};
Mat src(static_cast<int>(shape_src.size()), shape_src.data(), CV_32SC1, data_src.data());

auto get_index = [](const std::vector<int>& shape, size_t cnt) {
std::vector<int> index(shape.size());
size_t t = cnt;
for (int i = static_cast<int>(shape.size() - 1); i >= 0; --i) {
size_t idx = t / shape[i];
index[i] = static_cast<int>(t - idx * shape[i]);
t = idx;
}
return index;
};

auto fn_verify = [&get_index](const Mat& ref, const Mat& res) {
// check type
EXPECT_EQ(ref.type(), res.type());
// check shape
EXPECT_EQ(ref.dims, res.dims);
for (int i = 0; i < ref.dims; ++i) {
EXPECT_EQ(ref.size[i], res.size[i]);
}
// check value
std::vector<int> shape{ref.size.p, ref.size.p + ref.dims};
for (size_t i = 0; i < ref.total(); ++i) {
auto index = get_index(shape, i);
switch (ref.type()) {
case CV_32SC1: {
ASSERT_EQ(ref.at<int>(index.data()), res.at<int>(index.data()));
} break;
case CV_8UC1: {
ASSERT_EQ(ref.at<uint8_t>(index.data()), res.at<uint8_t>(index.data()));
} break;
case CV_32FC1: {
ASSERT_EQ(ref.at<float>(index.data()), res.at<float>(index.data()));
} break;
default: FAIL() << "Unsupported type: " << ref.type();
}
}
};

{
std::vector<int> shape{4, 2, 3};
std::vector<int> data_ref{
1, 1, 1, // [0, 0, :]
2, 2, 2, // [0, 1, :]
1, 1, 1, // [1, 0, :]
2, 2, 2, // [1, 1, :]
1, 1, 1, // [2, 0, :]
2, 2, 2, // [2, 1, :]
1, 1, 1, // [3, 0, :]
2, 2, 2 // [3, 1, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), src.type(), data_ref.data());
Mat dst;
broadcast(src, shape, dst);
fn_verify(ref, dst);
}

{
Mat _src;
src.convertTo(_src, CV_8U);
std::vector<int> shape{4, 2, 3};
std::vector<uint8_t> data_ref{
1, 1, 1, // [0, 0, :]
2, 2, 2, // [0, 1, :]
1, 1, 1, // [1, 0, :]
2, 2, 2, // [1, 1, :]
1, 1, 1, // [2, 0, :]
2, 2, 2, // [2, 1, :]
1, 1, 1, // [3, 0, :]
2, 2, 2 // [3, 1, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}

{
Mat _src;
src.convertTo(_src, CV_32F);
std::vector<int> shape{1, 1, 2, 1}; // {2, 1}
std::vector<float> data_ref{
1.f, // [0, 0, 0, 0]
2.f, // [0, 0, 1, 0]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}

{
std::vector<int> _shape_src{2, 3, 4};
std::vector<float> _data_src{
1.f, 2.f, 3.f, 4.f, // [0, 0, :]
2.f, 3.f, 4.f, 5.f, // [0, 1, :]
3.f, 4.f, 5.f, 6.f, // [0, 2, :]

4.f, 5.f, 6.f, 7.f, // [1, 0, :]
5.f, 6.f, 7.f, 8.f, // [1, 1, :]
6.f, 7.f, 8.f, 9.f, // [1, 2, :]
};
Mat _src(static_cast<int>(_shape_src.size()), _shape_src.data(), CV_32FC1, _data_src.data());

std::vector<int> shape{2, 1, 2, 3, 4};
std::vector<float> data_ref{
1.f, 2.f, 3.f, 4.f, // [0, 0, 0, 0, :]
2.f, 3.f, 4.f, 5.f, // [0, 0, 0, 1, :]
3.f, 4.f, 5.f, 6.f, // [0, 0, 0, 2, :]

4.f, 5.f, 6.f, 7.f, // [0, 0, 1, 0, :]
5.f, 6.f, 7.f, 8.f, // [0, 0, 1, 1, :]
6.f, 7.f, 8.f, 9.f, // [0, 0, 1, 2, :]

1.f, 2.f, 3.f, 4.f, // [1, 0, 0, 0, :]
2.f, 3.f, 4.f, 5.f, // [1, 0, 0, 1, :]
3.f, 4.f, 5.f, 6.f, // [1, 0, 0, 2, :]

4.f, 5.f, 6.f, 7.f, // [1, 0, 1, 0, :]
5.f, 6.f, 7.f, 8.f, // [1, 0, 1, 1, :]
6.f, 7.f, 8.f, 9.f, // [1, 0, 1, 2, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}
}

TEST(Core_minMaxIdx, regression_9207_2)
{
const int rows = 13;
@ -259,7 +259,7 @@ TEST_P (CountNonZeroND, ndim)
const int ONE_SIZE = 5;

vector<int> sizes(dims);
fill(sizes.begin(), sizes.end(), ONE_SIZE);
std::fill(sizes.begin(), sizes.end(), ONE_SIZE);

Mat data(sizes, CV_MAKETYPE(type, 1));
data = 0;
@ -1475,12 +1475,15 @@ template<typename R> struct TheTest
TheTest & test_float_math()
{
typedef typename V_RegTraits<R>::round_reg Ri;
Data<R> data1, data2, data3;
Data<R> data1, data1_border, data2, data3;
// See https://github.com/opencv/opencv/issues/24213
data1_border *= 0.5;
data1 *= 1.1;
data2 += 10;
R a1 = data1, a2 = data2, a3 = data3;
R a1 = data1, a1_border = data1_border, a2 = data2, a3 = data3;

Data<Ri> resB = v_round(a1),
resB_border = v_round(a1_border),
resC = v_trunc(a1),
resD = v_floor(a1),
resE = v_ceil(a1);
@ -1493,6 +1496,7 @@ template<typename R> struct TheTest
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(cvRound(data1[i]), resB[i]);
EXPECT_EQ(cvRound(data1_border[i]), resB_border[i]);
EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
@ -58,11 +58,6 @@ endif()
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")


if(HAVE_TENGINE)
ocv_target_compile_definitions(${the_module} PRIVATE "HAVE_TENGINE=1")
endif()


if(MSVC)
add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
@ -172,11 +167,6 @@ else()
set(sources_options ${sources_options} EXCLUDE_CUDA)
endif()

if(HAVE_TENGINE)
list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
endif()

if(HAVE_TIMVX)
list(APPEND include_dirs ${TIMVX_INCLUDE_DIR})
list(APPEND libs -Wl,--whole-archive ${TIMVX_LIBRARY} -Wl,--no-whole-archive)
@ -237,6 +227,10 @@ if(TARGET ocv.3rdparty.openvino AND OPENCV_DNN_OPENVINO)
endif()
endif()

set(OPENCV_DNN_BACKEND_DEFAULT "" CACHE STRING "Default backend used by the DNN module (DNN_BACKEND_OPENCV if empty)")
if(OPENCV_DNN_BACKEND_DEFAULT)
ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/dnn_params.cpp" "OPENCV_DNN_BACKEND_DEFAULT=${OPENCV_DNN_BACKEND_DEFAULT}")
endif()

ocv_install_used_external_targets(${libs} ${dnn_runtime_libs})

@ -69,9 +69,7 @@ CV__DNN_INLINE_NS_BEGIN
*/
enum Backend
{
//! DNN_BACKEND_DEFAULT equals to DNN_BACKEND_INFERENCE_ENGINE if
//! OpenCV is built with Intel OpenVINO or
//! DNN_BACKEND_OPENCV otherwise.
//! DNN_BACKEND_DEFAULT equals to OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter
DNN_BACKEND_DEFAULT = 0,
DNN_BACKEND_HALIDE,
DNN_BACKEND_INFERENCE_ENGINE, //!< Intel OpenVINO computational backend
@ -688,9 +686,6 @@ CV__DNN_INLINE_NS_BEGIN
* @brief Ask network to use specific computation backend where it supported.
* @param[in] backendId backend identifier.
* @see Backend
*
* If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
* means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals to DNN_BACKEND_OPENCV.
*/
CV_WRAP void setPreferableBackend(int backendId);

@ -191,10 +191,10 @@ class dnn_test(NewOpenCVTests):

def test_model(self):
img_path = self.find_dnn_file("dnn/street.png")
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", required=False)
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy_19e3ec3.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")

frame = cv.imread(img_path)
model = cv.dnn_DetectionModel(weights, config)
@ -101,8 +101,8 @@ PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)

PERF_TEST(MobileNet_SSD, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
"dnn/MobileNetSSD_deploy.caffemodel");
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
"dnn/MobileNetSSD_deploy_19e3ec3.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
@ -678,7 +678,6 @@ PERF_TEST_P_(Layer_FullyConnected, fc)
lp.set("axis", input.dims - 1);
lp.set("is_matmul", weights.dims > 2);
lp.set("bias_term", false);
lp.set("transB", true);
lp.set("num_output", (int)weights.total(0, weights.dims - 1));
lp.blobs.resize(1, weights);

@ -141,7 +141,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}

@ -36,7 +36,11 @@ bool getParam_DNN_OPENCL_ALLOW_ALL_DEVICES()
int getParam_DNN_BACKEND_DEFAULT()
{
static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef OPENCV_DNN_BACKEND_DEFAULT
(size_t)OPENCV_DNN_BACKEND_DEFAULT
#else
(size_t)DNN_BACKEND_OPENCV
#endif
);
return PARAM_DNN_BACKEND_DEFAULT;
}
@ -5,6 +5,7 @@
#include "precomp.hpp"

#include <opencv2/imgproc.hpp>
#include <opencv2/core/utils/logger.hpp>


namespace cv {
@ -100,15 +101,29 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
images_.getMatVector(images);
CV_Assert(!images.empty());

int nch = images[0].channels();
Scalar scalefactor = param.scalefactor;

if (param.ddepth == CV_8U)
{
CV_Assert(scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
CV_Assert(param.scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
}

int nch = images[0].channels();
Scalar scalefactor = param.scalefactor;
Scalar mean = param.mean;

if (param.swapRB)
{
if (nch > 2)
{
std::swap(mean[0], mean[2]);
std::swap(scalefactor[0], scalefactor[2]);
}
else
{
CV_LOG_WARNING(NULL, "Red/blue color swapping requires at least three image channels.");
}
}

for (size_t i = 0; i < images.size(); i++)
{
Size imgSize = images[i].size();
@ -126,9 +141,7 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
size);
images[i] = images[i](crop);
}
else
{
if (param.paddingmode == DNN_PMODE_LETTERBOX)
else if (param.paddingmode == DNN_PMODE_LETTERBOX)
{
float resizeFactor = std::min(size.width / (float)imgSize.width,
size.height / (float)imgSize.height);
@ -143,17 +156,11 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
}
else
{
resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
}
}

Scalar mean = param.mean;
if (param.swapRB)
{
std::swap(mean[0], mean[2]);
std::swap(scalefactor[0], scalefactor[2]);
}

if (images[i].depth() == CV_8U && param.ddepth == CV_32F)
images[i].convertTo(images[i], CV_32F);

@ -220,18 +227,22 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
CV_Assert(image.depth() == blob_.depth());
CV_Assert(image.channels() == image0.channels());
CV_Assert(image.size() == image0.size());
if (param.swapRB)
if (nch > 2 && param.swapRB)
{
Mat tmpRB;
cvtColor(image, tmpRB, COLOR_BGR2RGB);
tmpRB.copyTo(Mat(tmpRB.rows, tmpRB.cols, subMatType, blob.ptr((int)i, 0)));
}
else
{
image.copyTo(Mat(image.rows, image.cols, subMatType, blob.ptr((int)i, 0)));
}
}
}
else
{
CV_Error(Error::StsUnsupportedFormat, "Unsupported data layout in blobFromImagesWithParams function.");
}
}

void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
|
||||
|
@ -383,11 +383,17 @@ public:
|
||||
|
||||
#endif // OpenVINO >= 2022.1
|
||||
|
||||
InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
|
||||
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
|
||||
InfEngineNgraphNode::InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node)
|
||||
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {
|
||||
CV_Assert(node.get_node());
|
||||
CV_Assert(node.get_node_shared_ptr());
|
||||
}
|
||||
|
||||
InfEngineNgraphNode::InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node)
|
||||
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
|
||||
InfEngineNgraphNode::InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node)
|
||||
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {
|
||||
CV_Assert(node.get_node());
|
||||
CV_Assert(node.get_node_shared_ptr());
|
||||
}
|
||||
|
||||
InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,
|
||||
Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,
|
||||
@ -420,7 +426,7 @@ InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& n
|
||||
}
|
||||
|
||||
void InfEngineNgraphNode::setName(const std::string& name) {
|
||||
node->set_friendly_name(name);
|
||||
node.get_node()->set_friendly_name(name);
|
||||
}
|
||||
|
||||
InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl)
|
||||
@ -441,8 +447,7 @@ InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEn
|
||||
void InfEngineNgraphNet::addOutput(const Ptr<InfEngineNgraphNode>& node)
|
||||
{
|
||||
CV_Assert(node);
|
||||
CV_Assert(node->node);
|
||||
const std::string& name = node->node->get_friendly_name();
|
||||
const std::string& name = node->node.get_node()->get_friendly_name();
|
||||
requestedOutputs.insert({name, node.get()});
|
||||
}
|
||||
|
||||
@ -458,7 +463,7 @@ void InfEngineNgraphNet::createNet(Target targetId) {
|
||||
CV_Assert(output_node_it->second);
|
||||
auto out = std::make_shared<ngraph::op::Result>(output_node_it->second->node);
|
||||
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
|
||||
out->set_friendly_name(output_node_it->first + (output_node_it->second->node->get_output_size() == 1 ? "" : ".0"));
|
||||
out->set_friendly_name(output_node_it->first + (output_node_it->second->node.get_node()->get_output_size() == 1 ? "" : ".0"));
|
||||
#endif
|
||||
outs.push_back(out);
|
||||
}
|
||||
|
@ -93,13 +93,13 @@ public:
|
||||
std::vector<Mat*>& inputs, std::vector<Mat>& outputs,
|
||||
std::vector<Mat>& internals);
|
||||
|
||||
InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
|
||||
InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node);
|
||||
InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node);
|
||||
InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node);
|
||||
|
||||
void setName(const std::string& name);
|
||||
|
||||
// Inference Engine network object that allows to obtain the outputs of this layer.
|
||||
std::shared_ptr<ngraph::Node> node;
|
||||
ngraph::Output<ngraph::Node> node;
|
||||
Ptr<InfEngineNgraphNet> net;
|
||||
Ptr<dnn::Layer> cvLayer;
|
||||
};
|
||||
|
@ -457,7 +457,7 @@ public:
|
||||
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
|
||||
std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
|
||||
shape[1] = weights_.total();
|
||||
auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
|
||||
auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);
|
||||
|
@ -148,7 +148,7 @@ public:
|
||||
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
ngraph::OutputVector inp{ieInpNode};
|
||||
auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
|
||||
return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
|
||||
|
@ -392,7 +392,7 @@ public:
|
||||
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node->get_shape().size();
|
||||
const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape().size();
|
||||
const int cAxis = normalize_axis(axis, numDims);
|
||||
std::vector<size_t> maxDims(numDims, 0);
|
||||
|
||||
@ -403,7 +403,7 @@ public:
|
||||
auto inp = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
inp_nodes.push_back(inp);
|
||||
|
||||
std::vector<size_t> inpShape = inp->get_shape();
|
||||
std::vector<size_t> inpShape = inp.get_shape();
|
||||
for (int i = 0; i < numDims; ++i)
|
||||
maxDims[i] = std::max(maxDims[i], inpShape[i]);
|
||||
}
|
||||
|
@ -62,9 +62,6 @@
|
||||
#include "opencl_kernels_dnn.hpp"
|
||||
using namespace cv::dnn::ocl4dnn;
|
||||
#endif
|
||||
#ifdef HAVE_TENGINE
|
||||
#include "../tengine4dnn/include/tengine_graph_convolution.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
#include "../cuda4dnn/primitives/convolution.hpp"
|
||||
@ -267,10 +264,6 @@ public:
|
||||
float power;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_TENGINE
|
||||
teng_graph_t tengine_graph;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
|
||||
cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
|
||||
@ -289,20 +282,8 @@ public:
|
||||
#ifdef HAVE_CUDA
|
||||
cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE;
|
||||
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
|
||||
#endif
|
||||
#ifdef HAVE_TENGINE
|
||||
tengine_graph=NULL;
|
||||
#endif
|
||||
}
|
||||
#ifdef HAVE_TENGINE
|
||||
~ConvolutionLayerImpl()
|
||||
{
|
||||
if(NULL != tengine_graph )
|
||||
{
|
||||
tengine_release(tengine_graph);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
|
||||
{
|
||||
@ -466,13 +447,6 @@ public:
|
||||
for(int i = 0; i < numOutput; i++ )
|
||||
biasvec[i] = biasMat.at<float>(i);
|
||||
}
|
||||
#ifdef HAVE_TENGINE
|
||||
if(NULL != tengine_graph )
|
||||
{
|
||||
tengine_release(tengine_graph);
|
||||
tengine_graph = NULL ;
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_OPENCL
|
||||
convolutionOp.release();
|
||||
#endif
|
||||
@ -848,13 +822,13 @@ public:
|
||||
CV_Assert(!blobs.empty());
|
||||
CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
std::vector<size_t> dims = ieInpNode->get_shape();
|
||||
std::vector<size_t> dims = ieInpNode.get_shape();
|
||||
CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
|
||||
std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
|
||||
ngraph::Output<ngraph::Node> ieWeights;
|
||||
if (nodes.size() > 1)
|
||||
CV_Assert(ieWeights); // dynamic_cast should not fail
|
||||
ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
const int inpCn = dims[1];
|
||||
const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
|
||||
const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
|
||||
const int group = inpCn / inpGroupCn;
|
||||
|
||||
std::vector<size_t> kernel_shape;
|
||||
@ -1095,7 +1069,7 @@ public:
|
||||
config.pads = pads;
|
||||
config.stride = stride;
|
||||
config.dilation = dilation;
|
||||
if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
|
||||
if (inputs[0].dims != 4 && inputs[0].dims != (blobs.empty() ? umat_blobs[0].dims : blobs[0].dims))
|
||||
{
|
||||
static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
|
||||
if (!bypassCheck)
|
||||
@ -1107,7 +1081,7 @@ public:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
|
||||
config.group = inputs[0].size[1] / (blobs.empty() ? umat_blobs[0].size[1] : blobs[0].size[1]);
|
||||
if (config.group < 1) // config.group == 0 causes div by zero in ocl4dnn code
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group
|
||||
@ -1305,65 +1279,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_TENGINE
|
||||
bool tengine_ret = false;
|
||||
|
||||
std::vector<Mat> teng_in, teng_out;
|
||||
inputs_arr.getMatVector(teng_in);
|
||||
outputs_arr.getMatVector(teng_out);
|
||||
|
||||
int inch = teng_in[0].size[1]; // inch
|
||||
int in_h = teng_in[0].size[2]; // in_h
|
||||
int in_w = teng_in[0].size[3]; // in_w
|
||||
|
||||
int out_b = teng_out[0].size[0]; // out batch size
|
||||
int outch = teng_out[0].size[1]; // outch
|
||||
int out_h = teng_out[0].size[2]; // out_h
|
||||
int out_w = teng_out[0].size[3]; // out_w
|
||||
|
||||
float *input_ = teng_in[0].ptr<float>();
|
||||
float *output_ = teng_out[0].ptr<float>();
|
||||
float *kernel_ = weightsMat.ptr<float>();
|
||||
float *teg_bias = &biasvec[0];
|
||||
|
||||
int nstripes = std::max(getNumThreads(), 1);
|
||||
|
||||
/* tengine_init will run when first time. */
|
||||
if(NULL == tengine_graph)
|
||||
{
|
||||
// pads_begin: 0 - pad_top, 1 - pad_left
|
||||
// pads_end: 0 - pad_bottom, 1 - pad_right
|
||||
// pad_h0: pad_top, pad_h1: pad_bottom
|
||||
// pad_w0: pad_left, pad_w1: pad_right
|
||||
tengine_graph = tengine_init(name.c_str(), input_, inch, ngroups, in_h, in_w,
|
||||
output_, out_b, outch, out_h, out_w,
|
||||
kernel_, kernel_size.size(), kernel.height, kernel.width,
|
||||
teg_bias, stride.height, stride.width,
|
||||
pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
|
||||
weightsMat.step1(), padMode, tengine_graph, nstripes);
|
||||
// printf("Init(%s): input=%p(%d %d %d %d ),output=%p(%d %d %d %d ),kernel=%p(%ld %d %d ), bias=%p ,"
|
||||
// "stride(%d %d), pad(%d %d %d %d), dilation(%d %d) ,weightsMat=%ld, padMode=%s ,tengine_graph = %p \n",
|
||||
// name.c_str(),input_, inch, ngroups, in_h, in_w,
|
||||
// output_, out_b, outch, out_h, out_w,
|
||||
// kernel_, kernel_size.size(), kernel.height, kernel.width,
|
||||
// teg_bias, stride.height, stride.width,
|
||||
// pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
|
||||
// weightsMat.step1(), padMode.c_str() ,tengine_graph);
|
||||
}
|
||||
if(NULL != tengine_graph)
|
||||
{
|
||||
tengine_ret = tengine_forward(tengine_graph);
|
||||
}
|
||||
/* activation */
|
||||
if((true == tengine_ret) && activ )
|
||||
{
|
||||
int out_cstep = out_h * out_w; // out_cstep
|
||||
|
||||
ActivationLayer* activ_ = activ.get();
|
||||
activ_->forwardSlice(output_, output_, out_cstep, out_cstep, 0, outch);
|
||||
}
|
||||
if(false == tengine_ret)
|
||||
#endif
|
||||
{
|
||||
int nstripes = std::max(getNumThreads(), 1);
|
||||
int conv_dim = CONV_2D;
|
||||
|
@ -14,7 +14,7 @@
|
||||
#define CONV_NR_FP32 28
|
||||
|
||||
// The FP16 can only be supported by ARM64 and with FP16 FMA supported.
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && CV_FP16 // check FP16 FMA.
|
||||
#define CONV_ARM_FP16 1
|
||||
#endif
|
||||
|
||||
|
@ -133,7 +133,7 @@ public:
|
||||
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
|
||||
auto rois_shape = rois->get_shape();
|
||||
auto rois_shape = rois.get_shape();
|
||||
std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
|
||||
offsets[3] = 2;
|
||||
dims[3] = 7;
|
||||
|
@ -490,7 +490,7 @@ struct ReLUFunctor : public BaseFunctor
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
if (slope) {
|
||||
auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
|
||||
@ -674,7 +674,7 @@ struct ReLU6Functor : public BaseFunctor
|
||||
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
return std::make_shared<ngraph::op::Clamp>(node, minValue, maxValue);
|
||||
}
|
||||
@ -796,7 +796,7 @@ struct BaseDefaultFunctor : public BaseFunctor
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
CV_Error(Error::StsNotImplemented, "");
|
||||
}
|
||||
@ -929,7 +929,7 @@ struct TanHFunctor : public BaseDefaultFunctor<TanHFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
return std::make_shared<ngraph::op::Tanh>(node);
|
||||
}
|
||||
@ -998,7 +998,7 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
auto sigmoid = std::make_shared<ngraph::op::Sigmoid>(node);
|
||||
return std::make_shared<ngraph::op::v1::Multiply>(node, sigmoid);
|
||||
@ -1074,7 +1074,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
float one = 1.0f;
|
||||
auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
|
||||
@ -1157,7 +1157,7 @@ struct SigmoidFunctor : public BaseDefaultFunctor<SigmoidFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
return std::make_shared<ngraph::op::Sigmoid>(node);
|
||||
}
|
||||
@ -1237,7 +1237,7 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
return std::make_shared<ngraph::op::Elu>(node, alpha);
|
||||
}
|
||||
@ -1307,7 +1307,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
float coeff = -0.999999f;
|
||||
// float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
|
||||
@ -1603,7 +1603,7 @@ struct SqrtFunctor : public BaseDefaultFunctor<SqrtFunctor>
|
||||
#endif // HAVE_HALIDE
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
return std::make_shared<ngraph::op::v0::Sqrt>(node);
|
||||
}
|
||||
@ -2329,7 +2329,7 @@ struct PowerFunctor : public BaseFunctor
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
|
||||
ngraph::Shape{1}, &scale);
|
||||
@ -2439,7 +2439,7 @@ struct ExpFunctor : public BaseDefaultFunctor<ExpFunctor>
|
||||
#endif // HAVE_HALIDE
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
|
||||
ngraph::Shape{1}, &normScale);
|
||||
@ -2598,7 +2598,7 @@ struct ChannelsPReLUFunctor : public BaseFunctor
|
||||
#endif // HAVE_CANN
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
const size_t numChannels = scale.total();
|
||||
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numChannels}, scale.data);
|
||||
@ -2678,7 +2678,7 @@ struct PReLUFunctor : public ChannelsPReLUFunctor
|
||||
}
|
||||
|
||||
#ifdef HAVE_DNN_NGRAPH
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
|
||||
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
|
||||
{
|
||||
auto shape = getShape<size_t>(scale);
|
||||
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, scale.ptr<float>());
|
||||
|
@ -896,12 +896,14 @@ public:
|
||||
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
CV_Assert(nodes.size() >= 2);
|
||||
auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
if (!coeffs.empty()) {
|
||||
auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
|
||||
curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::Node> res;
|
||||
for (size_t i = 1; i < nodes.size(); i++)
|
||||
{
|
||||
auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
@ -910,15 +912,16 @@ public:
|
||||
next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
switch (op) {
|
||||
case SUM: curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
|
||||
case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
|
||||
case DIV: curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
|
||||
case MAX: curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
|
||||
case MIN: curr_node = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
|
||||
case SUM: res = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
|
||||
case PROD: res = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
|
||||
case DIV: res = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
|
||||
case MAX: res = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
|
||||
case MIN: res = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
|
||||
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
|
||||
}
|
||||
curr_node = res;
|
||||
}
|
||||
return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
|
||||
return Ptr<BackendNode>(new InfEngineNgraphNode(res));
|
||||
}
|
||||
#endif // HAVE_DNN_NGRAPH
|
||||
|
||||
|
@ -209,7 +209,7 @@ public:
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
std::vector<size_t> dims = ieInpNode->get_shape();
|
||||
std::vector<size_t> dims = ieInpNode.get_shape();
|
||||
|
||||
int numAxes = dims.size();
|
||||
int startAxis = normalize_axis(_startAxis, numAxes);
|
||||
|
@ -803,7 +803,7 @@ public:
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<int> shape(1 + normalize_axis(axis, ieInpNode->get_shape().size()), 0);
|
||||
std::vector<int> shape(1 + normalize_axis(axis, ieInpNode.get_shape().size()), 0);
|
||||
shape[shape.size() - 1] = -1;
|
||||
auto inp = std::make_shared<ngraph::op::v1::Reshape>(
|
||||
ieInpNode,
|
||||
|
@ -480,7 +480,7 @@ public:
|
||||
if (type != SPATIAL_NRM) {
|
||||
axes = {1};
|
||||
} else {
|
||||
axes.resize(ieInpNode->get_shape().size() - 2);
|
||||
axes.resize(ieInpNode.get_shape().size() - 2);
|
||||
std::iota(axes.begin(), axes.end(), 2);
|
||||
}
|
||||
auto ngraph_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes.data());
|
||||
|
@ -194,7 +194,7 @@ public:
|
||||
std::vector<MatShape> inpShapes(nodes.size());
|
||||
std::vector<MatShape> outShapes, internals;
|
||||
for (int i = 0; i < nodes.size(); ++i) {
|
||||
std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
|
||||
std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
|
||||
inpShapes[i] = std::vector<int>(shape.begin(), shape.end());
|
||||
}
|
||||
getMemoryShapes(inpShapes, 1, outShapes, internals);
|
||||
@ -213,7 +213,7 @@ public:
|
||||
std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &newShape),
|
||||
true
|
||||
);
|
||||
if (indices->get_element_type() != ngraph::element::i32 && indices->get_element_type() != ngraph::element::i64) {
|
||||
if (indices.get_element_type() != ngraph::element::i32 && indices.get_element_type() != ngraph::element::i64) {
|
||||
indices = std::make_shared<ngraph::op::Convert>(indices, ngraph::element::i64);
|
||||
}
|
||||
|
||||
|
@ -390,7 +390,7 @@ public:
|
||||
auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
|
||||
#else
|
||||
int64_t start_axis = acrossChannels ? 1 : 2;
|
||||
std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
|
||||
std::vector<int64_t> axes_v(ieInpNode.get_shape().size() - start_axis);
|
||||
std::iota(axes_v.begin(), axes_v.end(), start_axis);
|
||||
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
|
||||
auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
|
||||
|
@ -900,12 +900,12 @@ public:
|
||||
auto& inp0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto& inp1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
|
||||
if (inp0->get_element_type() != inp1->get_element_type()) {
|
||||
if (inp0.get_element_type() != inp1.get_element_type()) {
|
||||
auto dtype = preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD ?
|
||||
ngraph::element::f16 : ngraph::element::f32;
|
||||
if (inp0->get_element_type() != dtype)
|
||||
if (inp0.get_element_type() != dtype)
|
||||
inp0 = std::make_shared<ngraph::op::v0::Convert>(inp0, dtype);
|
||||
if (inp1->get_element_type() != dtype)
|
||||
if (inp1.get_element_type() != dtype)
|
||||
inp1 = std::make_shared<ngraph::op::v0::Convert>(inp1, dtype);
|
||||
}
|
||||
|
||||
|
@ -273,21 +273,21 @@ public:
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
const size_t batch = ieInpNode->get_shape()[0];
|
||||
const size_t numChannels = ieInpNode->get_shape()[1];
|
||||
const size_t batch = ieInpNode.get_shape()[0];
|
||||
const size_t numChannels = ieInpNode.get_shape()[1];
|
||||
|
||||
std::vector<int64_t> axes_data;
|
||||
if (!acrossSpatial) {
|
||||
axes_data.push_back(1);
|
||||
} else {
|
||||
axes_data.resize(ieInpNode->get_shape().size() - 1);
|
||||
axes_data.resize(ieInpNode.get_shape().size() - 1);
|
||||
std::iota(axes_data.begin(), axes_data.end(), 1);
|
||||
}
|
||||
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
|
||||
auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
|
||||
|
||||
CV_Assert(blobs.empty() || numChannels == blobs[0].total());
|
||||
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
|
||||
std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
|
||||
shape[0] = blobs.empty() ? 1 : batch;
|
||||
shape[1] = numChannels;
|
||||
if (!blobs.empty())
|
||||
|
@ -209,7 +209,8 @@ public:
|
||||
#ifdef HAVE_INF_ENGINE
|
||||
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
|
||||
{
|
||||
return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
|
||||
return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin()) &&
|
||||
(!computeMaxIdx || INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1));
|
||||
}
|
||||
#endif
|
||||
if (backendId == DNN_BACKEND_OPENCV)
|
||||
@ -600,7 +601,7 @@ public:
|
||||
return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
|
||||
}
|
||||
else if (type == SUM) {
|
||||
ngraph::Shape inpShape = ieInpNode->get_shape();
|
||||
ngraph::Shape inpShape = ieInpNode.get_shape();
|
||||
CV_Assert(inpShape.size() == 2 + kernel_size.size());
|
||||
std::vector<int64_t> axes;
|
||||
for (size_t i = 0; i < kernel_size.size(); i++)
|
||||
@ -615,10 +616,14 @@ public:
|
||||
else if (type == MAX) {
|
||||
std::shared_ptr<ngraph::Node> max_pool;
|
||||
if (computeMaxIdx) {
|
||||
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
|
||||
std::vector<size_t> dilations(kernel_size.size(), 1);
|
||||
max_pool = std::make_shared<ngraph::op::v8::MaxPool>(ieInpNode, ngraph::Strides(strides), ngraph::Strides(dilations),
|
||||
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
|
||||
rounding_type, pad_type);
|
||||
#else
|
||||
CV_Error(Error::StsNotImplemented, "OpenVINO MaxPool with indices");
|
||||
#endif
|
||||
} else {
|
||||
max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
|
||||
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
|
||||
|
@ -366,10 +366,10 @@ public:
|
||||
auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto& image_shape = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
|
||||
CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
|
||||
CV_Assert_N(image_shape.get_shape().size() == 2, image_shape.get_shape().front() == 1);
|
||||
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
|
||||
ngraph::Shape{1},
|
||||
std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
|
||||
std::vector<int64_t>{(int64_t)image_shape.get_shape().back()});
|
||||
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);
|
||||
|
||||
auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);
|
||||
|
@ -466,7 +466,7 @@ public:
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto& input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto parent_shape = input->get_shape();
|
||||
auto parent_shape = input.get_shape();
|
||||
int64_t b = parent_shape[0];
|
||||
int64_t h = parent_shape[1];
|
||||
int64_t w = parent_shape[2];
|
||||
@ -567,7 +567,7 @@ public:
|
||||
int hNorm, wNorm;
|
||||
if (nodes.size() > 1)
|
||||
{
|
||||
auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
|
||||
auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
|
||||
hNorm = node_1_shape[2];
|
||||
wNorm = node_1_shape[3];
|
||||
}
|
||||
|
@ -443,7 +443,7 @@ public:
|
||||
std::vector<int64_t> shape = {outHeight, outWidth};
|
||||
auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
|
||||
|
||||
auto& input_shape = ieInpNode->get_shape();
|
||||
auto& input_shape = ieInpNode.get_shape();
|
||||
CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0);
|
||||
std::vector<float> scales = {static_cast<float>(outHeight) / input_shape[2], static_cast<float>(outWidth) / input_shape[3]};
|
||||
auto scales_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{2}, scales.data());
|
||||
|
@ -331,34 +331,36 @@ public:
|
||||
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
auto ieInpNode1 = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
|
||||
ngraph::Output<ngraph::Node> ieInpNode1;
|
||||
if (nodes.size() > 1)
|
||||
ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
|
||||
size_t numChannels = 1;
|
||||
if (blobs.empty())
|
||||
for (const size_t& dim : ieInpNode1->get_shape())
|
||||
for (const size_t& dim : ieInpNode1.get_shape())
|
||||
numChannels *= dim;
|
||||
else
|
||||
numChannels = blobs[0].total();
|
||||
|
||||
std::vector<size_t> shape(ieInpNode0->get_shape().size(), 1);
|
||||
std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
|
||||
int cAxis = normalize_axis(axis, shape.size());
|
||||
shape[cAxis] = numChannels;
|
||||
|
||||
auto node = ieInpNode0;
|
||||
std::shared_ptr<ngraph::Node> node;
|
||||
if (hasWeights)
|
||||
{
|
||||
auto weight = blobs.empty() ? ieInpNode1 :
|
||||
ngraph::Output<ngraph::Node> weight = blobs.empty() ? ieInpNode1 :
|
||||
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
|
||||
|
||||
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
|
||||
node = std::make_shared<ngraph::op::v1::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
#else
|
||||
node = std::make_shared<ngraph::op::v0::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
node = std::make_shared<ngraph::op::v0::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
|
||||
#endif
|
||||
}
|
||||
if (hasBias || !hasWeights)
|
||||
{
|
||||
std::shared_ptr<ngraph::Node> bias;
|
||||
ngraph::Output<ngraph::Node> bias;
|
||||
if (hasBias)
|
||||
{
|
||||
bias = blobs.empty() ? ieInpNode1 :
|
||||
|
@ -759,7 +759,7 @@ public:
|
||||
{
|
||||
CV_Assert_N(nodes.size() <= 2);
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());
|
||||
CV_Assert(finalSliceRanges[0].size() == ieInpNode.get_shape().size());
|
||||
|
||||
std::vector<int64_t> offsets, dims;
|
||||
for (int i = 0; i < finalSliceRanges[0].size(); ++i)
|
||||
|
@ -385,7 +385,7 @@ public:
|
||||
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
|
||||
{
|
||||
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
|
||||
int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size());
|
||||
int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
|
||||
auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
|
||||
if (logSoftMax)
|
||||
return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
|
||||
|
@ -210,7 +210,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
|
||||
if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) != "add")
|
||||
{
|
||||
CV_LOG_DEBUG(NULL, "DNN/CPU: fusion with NaryEltwise or Eltwise Layer operation is not supported: "
|
||||
<< nextData->params.get<String>("operation"));
|
||||
<< toLowerCase(nextData->params.get<String>("operation", "sum")));
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -252,7 +252,7 @@ void NetImplOpenVINO::addNgraphOutputs(LayerData& ld)
|
||||
CV_Assert(!ieInpNode->net.empty());
|
||||
if (layerNet != ieInpNode->net)
|
||||
{
|
||||
CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node->get_friendly_name());
|
||||
CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node.get_node()->get_friendly_name());
|
||||
ieInpNode->net->addOutput(ieInpNode);
|
||||
}
|
||||
}
|
||||
@ -321,8 +321,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
return;
|
||||
}
|
||||
|
||||
#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
|
||||
bool supportsCPUFallback = !isArmComputePlugin() && (preferableTarget == DNN_TARGET_CPU ||
|
||||
openvino::checkTarget(DNN_TARGET_CPU));
|
||||
#endif
|
||||
|
||||
// Build Inference Engine networks from sets of layers that support this
|
||||
// backend. Split a whole model on several Inference Engine networks if
|
||||
@ -341,6 +343,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
|
||||
bool fused = ld.skip;
|
||||
Ptr<Layer> layer = ld.layerInstance;
|
||||
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
|
||||
if (ld.id == 0)
|
||||
continue;
|
||||
#else
|
||||
if (!fused && !layer->supportBackend(preferableBackend))
|
||||
{
|
||||
CV_LOG_DEBUG(NULL, "DNN/IE: NOT supported!");
|
||||
@ -355,17 +361,6 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: fix these workarounds
|
||||
if (preferableTarget == DNN_TARGET_MYRIAD ||
|
||||
preferableTarget == DNN_TARGET_HDDL ||
|
||||
preferableTarget == DNN_TARGET_OPENCL ||
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
customizable &= ld.type != "Concat";
|
||||
|
||||
if (preferableTarget == DNN_TARGET_OPENCL ||
|
||||
preferableTarget == DNN_TARGET_OPENCL_FP16)
|
||||
customizable &= ld.type != "Power";
|
||||
|
||||
if (preferableTarget == DNN_TARGET_OPENCL)
|
||||
customizable &= ld.type != "Eltwise";
|
||||
|
||||
@ -390,6 +385,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ld.skip = true; // Initially skip all Inference Engine supported layers.
|
||||
|
||||
// Create a new network if one of inputs from different Inference Engine graph.
|
||||
@ -478,7 +474,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
int oid = ld.inputBlobsId[i].oid;
|
||||
|
||||
auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
|
||||
const auto& ngraph_input_node = ieInpNode->node;
|
||||
const auto& ngraph_input_node = ieInpNode->node.get_node_shared_ptr();
|
||||
CV_LOG_DEBUG(NULL, "DNN/IE: bind output port " << lid << ":" << oid << " (" << ngraph_input_node->get_friendly_name() << ":" << ngraph_input_node->get_type_info().name << ")");
|
||||
|
||||
if ((oid == 0 && ngraph_input_node->get_output_size() == 1) || lid == 0)
|
||||
@ -498,10 +494,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
}
|
||||
CV_CheckLT((size_t)oid, ngraph_input_node->get_output_size(), "");
|
||||
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
|
||||
// FIXIT refactor ".initNgraph()" API to use Output<Node>
|
||||
// WA: use Concat to emulate Identity operation with requested output port
|
||||
auto oid_node = std::make_shared<ngraph::op::Concat>(ngraph::OutputVector { ngraph_input_node->output(oid) }, 0);
|
||||
inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(oid_node));
|
||||
inputNodes[i] = new InfEngineNgraphNode(ngraph_input_node->output(oid));
|
||||
#elif INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_3)
|
||||
inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid)));
|
||||
#else
|
||||
@ -556,6 +549,36 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
addNgraphOutputs(ld);
|
||||
}
|
||||
|
||||
// User may choose to return only intermediate blobs but not network's result (see Test_TFLite.max_unpooling)
|
||||
// Such layers should not be skipped when forwardLayer is called.
|
||||
// Also, perform a sanity check that there is no double inferred networks (a single skip=false per unique net instance)
|
||||
std::set<Ptr<InfEngineNgraphNet>> uniqueNets;
|
||||
if (!blobsToKeep_.empty())
|
||||
{
|
||||
LayerPin latestLayerPin = getLatestLayerPin(blobsToKeep_);
|
||||
for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
|
||||
{
|
||||
LayerData& ld = it->second;
|
||||
auto iter = ld.backendNodes.find(preferableBackend);
|
||||
if (iter == ld.backendNodes.end())
|
||||
continue;
|
||||
|
||||
Ptr<BackendNode>& node = iter->second;
|
||||
if (node.empty())
|
||||
continue;
|
||||
|
||||
Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
|
||||
if (ieNode.empty())
|
||||
continue;
|
||||
|
||||
if (ld.id == latestLayerPin.lid) {
|
||||
ld.skip = false;
|
||||
uniqueNets.insert(ieNode->net);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize all networks.
|
||||
for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
|
||||
{
|
||||
@ -578,9 +601,15 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
|
||||
{
|
||||
ieNode->net->addOutput(ieNode);
|
||||
ieNode->net->createNet((Target)preferableTarget);
|
||||
if (uniqueNets.find(ieNode->net) == uniqueNets.end()) {
|
||||
ld.skip = false;
|
||||
uniqueNets.insert(ieNode->net);
|
||||
}
|
||||
}
|
||||
}
|
||||
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
|
||||
CV_Assert(uniqueNets.size() == 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "halide_scheduler.hpp"
|
||||
|
||||
#include <HalideRuntimeOpenCL.h>
|
||||
#include <thread>
|
||||
#endif // HAVE_HALIDE
|
||||
|
||||
namespace cv {
|
||||
|
@ -453,14 +453,14 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
|
||||
int w;
|
||||
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));
|
||||
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));
|
||||
vstore8(vload8(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
|
||||
vstore8(vload8(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
slm_brow0 = slm_brow + local_x * (TILE_K / 8);
|
||||
@ -469,17 +469,17 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
|
||||
while( w + TILE_K <= end_w ) {
|
||||
Dtype8 arow;
|
||||
|
||||
brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));
|
||||
brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));
|
||||
brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));
|
||||
brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));
|
||||
brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));
|
||||
brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));
|
||||
brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));
|
||||
brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));
|
||||
brow0 = vload8(0, slm_brow0 + 0 * SLM_BLOCK);
|
||||
brow1 = vload8(0, slm_brow0 + 1 * SLM_BLOCK);
|
||||
brow2 = vload8(0, slm_brow0 + 2 * SLM_BLOCK);
|
||||
brow3 = vload8(0, slm_brow0 + 3 * SLM_BLOCK);
|
||||
brow4 = vload8(0, slm_brow0 + 4 * SLM_BLOCK);
|
||||
brow5 = vload8(0, slm_brow0 + 5 * SLM_BLOCK);
|
||||
brow6 = vload8(0, slm_brow0 + 6 * SLM_BLOCK);
|
||||
brow7 = vload8(0, slm_brow0 + 7 * SLM_BLOCK);
|
||||
|
||||
#define MM_DOT_PRODUCT( _row, _dot ) \
|
||||
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
|
||||
arow = vload8(0, src0_read + _row * K); \
|
||||
_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \
|
||||
_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \
|
||||
_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \
|
||||
@ -510,7 +510,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
|
||||
Dtype8 arow;
|
||||
|
||||
#define READ_BROW(_brow, _row) \
|
||||
_brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \
|
||||
_brow = vload8(0, slm_brow0 + _row * SLM_BLOCK); \
|
||||
_brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \
|
||||
_brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \
|
||||
_brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \
|
||||
@ -532,7 +532,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
|
||||
#undef READ_BROW
|
||||
|
||||
#define MM_DOT_PRODUCT( _row, _dot ) \
|
||||
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
|
||||
arow = vload8(0, src0_read + _row * K); \
|
||||
arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \
|
||||
arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \
|
||||
arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \
|
||||
|
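For context, the hunks above appear to swap the FP16-specific bit-cast loads (as_half8 over reinterpreted float pointers) for plain Dtype vload8/vstore8 accesses in the tiled gemm_buffer_NT kernel. The C++ sketch below is illustrative only — the names arow, brow, dot and the fixed tile width are placeholders, not taken from the kernel — and shows the accumulation that one MM_DOT_PRODUCT expansion performs for a single row of A against eight rows of B (C = A * B^T):

    #include <array>

    // Scalar reference for one MM_DOT_PRODUCT step: eight k values of one A row
    // are combined with the same k window of eight B rows, updating the eight
    // output columns held in the kernel's Dtype8 accumulator (here: dot).
    void accumulate_tile(const float* arow,                        // A[row][k .. k+7]
                         const std::array<const float*, 8>& brow,  // B[col][k .. k+7], col = 0..7
                         std::array<float, 8>& dot)                // running C[row][col .. col+7]
    {
        for (int k = 0; k < 8; ++k)            // unrolled as .s0 .. .s7 in the kernel
            for (int col = 0; col < 8; ++col)
                dot[col] += arow[k] * brow[col][k];
    }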
@ -1,53 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* License); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2020, OPEN AI LAB
|
||||
* Author: qtang@openailab.com
|
||||
*/
|
||||
|
||||
#ifndef TENGINE_GRAPH_CONVOLUTION_HPP
|
||||
#define TENGINE_GRAPH_CONVOLUTION_HPP
|
||||
|
||||
#define FLOAT_TO_REALSIZE (4)
|
||||
#ifdef HAVE_TENGINE
|
||||
|
||||
#include "tengine_c_api.h"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace dnn
|
||||
{
|
||||
// pad_h0: pad_top
|
||||
// pad_h1: pad_bottom
|
||||
// pad_w0: pad_left
|
||||
// pad_w1: pad_right
|
||||
teng_graph_t tengine_init(const char* name , float* input_, int inch, int group, int in_h, int in_w,
|
||||
float *output_, int out_b, int outch, int out_h, int out_w,
|
||||
float *kernel_,int kernel_s , int kernel_h, int kernel_w,
|
||||
float *teg_bias, int stride_h, int stride_w,
|
||||
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
|
||||
size_t wstep, const std::string padMode , teng_graph_t& graph, int nstripes) ;
|
||||
|
||||
bool tengine_forward(teng_graph_t& graph) ;
|
||||
bool tengine_release(teng_graph_t& graph) ;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif /* TENGINE_GRAPH_CONVOLUTION_HPP */
|
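The header above (removed in this merge) exposed a three-call lifecycle: tengine_init builds and pre-runs a convolution graph, tengine_forward executes it, and tengine_release tears it down. A hedged C++ sketch of a caller follows, assuming the tengine_c_api.h types and the cv::dnn declarations above; every size, name, and parameter value below is a placeholder, not taken from this commit:

    #include <vector>
    #include <string>

    // Assumes the (removed) tengine_graph_convolution.hpp declarations and the
    // tengine_c_api.h types. The wrapper only accepted square-kernel Conv2D
    // (kernel_s == 2, out_b == 1), so the placeholder shapes respect that.
    void run_conv_with_tengine()
    {
        const int inch = 16, group = 1, in_h = 32, in_w = 32;
        const int out_b = 1, outch = 32, out_h = 32, out_w = 32;
        const int k = 3, stride = 1, pad = 1, dilation = 1;

        std::vector<float> input(inch * in_h * in_w), output(outch * out_h * out_w);
        std::vector<float> kernel(outch * (inch / group) * k * k), bias(outch);
        const size_t wstep = (inch / group) * k * k;  // equals kernel_inwh, so weights are used in place

        teng_graph_t graph = NULL;
        graph = cv::dnn::tengine_init("conv1", input.data(), inch, group, in_h, in_w,
                                      output.data(), out_b, outch, out_h, out_w,
                                      kernel.data(), /*kernel_s=*/2, k, k,
                                      bias.data(), stride, stride,
                                      pad, pad, pad, pad, dilation, dilation,
                                      wstep, /*padMode=*/"", graph, /*nstripes=*/4);
        if (graph != NULL && cv::dnn::tengine_forward(graph))  // run the prepared graph
            cv::dnn::tengine_release(graph);                   // postrun + destroy
    }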
@ -1,370 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* License); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2020, OPEN AI LAB
|
||||
* Author: qtang@openailab.com
|
||||
*/
|
||||
|
||||
#include "../../precomp.hpp"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include <opencv2/core/utils/configuration.private.hpp>
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include "../include/tengine_graph_convolution.hpp"
|
||||
|
||||
#ifdef HAVE_TENGINE
|
||||
|
||||
#include "tengine_c_api.h"
|
||||
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace dnn
|
||||
{
|
||||
static int create_input_node(teng_graph_t graph, const char* node_name, int inch, int in_h, int in_w)
|
||||
{
|
||||
node_t node = teng_create_graph_node(graph, node_name, "InputOp");
|
||||
tensor_t tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
|
||||
teng_set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
|
||||
|
||||
int dims[4] = {1, inch, in_h, in_w};
|
||||
teng_set_tensor_shape(tensor, dims, 4);
|
||||
|
||||
teng_release_graph_tensor(tensor);
|
||||
teng_release_graph_node(node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int create_conv_node(teng_graph_t graph, const char* node_name, const char* input_name, int in_h, int in_w, int out_h, int out_w,
|
||||
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, int pad_w0, int pad_w1, int inch, int outch, int group,
|
||||
int dilation_h, int dilation_w, int activation, std::string padMode)
|
||||
{
|
||||
node_t conv_node = teng_create_graph_node(graph, node_name, "Convolution");
|
||||
tensor_t input_tensor = teng_get_graph_tensor(graph, input_name);
|
||||
|
||||
if (input_tensor == NULL)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: input_tensor is NULL." );
|
||||
return -1;
|
||||
}
|
||||
|
||||
teng_set_node_input_tensor(conv_node, 0, input_tensor);
|
||||
teng_release_graph_tensor(input_tensor);
|
||||
|
||||
/* output */
|
||||
tensor_t output_tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
|
||||
|
||||
teng_set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
|
||||
teng_release_graph_tensor(output_tensor);
|
||||
|
||||
/* weight */
|
||||
std::string weight_name(node_name);
|
||||
weight_name += "/weight";
|
||||
|
||||
node_t w_node = teng_create_graph_node(graph, weight_name.c_str(), "Const");
|
||||
tensor_t w_tensor = teng_create_graph_tensor(graph, weight_name.c_str(), TENGINE_DT_FP32);
|
||||
teng_set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
|
||||
teng_set_node_input_tensor(conv_node, 1, w_tensor);
|
||||
int w_dims[] = {outch, inch / group, kernel_h, kernel_w};
|
||||
|
||||
teng_set_tensor_shape(w_tensor, w_dims, 4);
|
||||
|
||||
teng_release_graph_node(w_node);
|
||||
teng_release_graph_tensor(w_tensor);
|
||||
|
||||
/* bias */
|
||||
std::string bias_name(node_name);
|
||||
bias_name += "/bias";
|
||||
|
||||
node_t b_node = teng_create_graph_node(graph, bias_name.c_str(), "Const");
|
||||
tensor_t b_tensor = teng_create_graph_tensor(graph, bias_name.c_str(), TENGINE_DT_FP32);
|
||||
teng_set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
|
||||
int b_dims[] = {outch};
|
||||
|
||||
teng_set_tensor_shape(b_tensor, b_dims, 1);
|
||||
|
||||
teng_set_node_input_tensor(conv_node, 2, b_tensor);
|
||||
teng_release_graph_node(b_node);
|
||||
teng_release_graph_tensor(b_tensor);
|
||||
|
||||
if (!padMode.empty())
|
||||
{
|
||||
if (padMode == "SAME")
|
||||
{
|
||||
int out_h_temp = (in_h-kernel_h + 2*pad_h0)/stride_h + 1;
|
||||
int out_w_temp = (in_w-kernel_w + 2*pad_w0)/stride_w + 1;
|
||||
|
||||
if (out_h_temp < out_h)
|
||||
pad_h1 += 1;
|
||||
if (out_w_temp < out_w)
|
||||
pad_w1 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* attr */
|
||||
teng_set_node_attr_int(conv_node, "kernel_h", &kernel_h);
|
||||
teng_set_node_attr_int(conv_node, "kernel_w", &kernel_w);
|
||||
teng_set_node_attr_int(conv_node, "stride_h", &stride_h);
|
||||
teng_set_node_attr_int(conv_node, "stride_w", &stride_w);
|
||||
teng_set_node_attr_int(conv_node, "pad_h0", &pad_h0);
|
||||
teng_set_node_attr_int(conv_node, "pad_w0", &pad_w0);
|
||||
teng_set_node_attr_int(conv_node, "pad_h1", &pad_h1);
|
||||
teng_set_node_attr_int(conv_node, "pad_w1", &pad_w1);
|
||||
teng_set_node_attr_int(conv_node, "output_channel", &outch);
|
||||
teng_set_node_attr_int(conv_node, "input_channel", &inch);
|
||||
teng_set_node_attr_int(conv_node, "group", &group);
|
||||
teng_set_node_attr_int(conv_node, "dilation_h", &dilation_h);
|
||||
teng_set_node_attr_int(conv_node, "dilation_w", &dilation_w);
|
||||
// set_node_attr_int(conv_node, "activation", &activation);
|
||||
|
||||
teng_release_graph_node(conv_node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static teng_graph_t create_conv_graph(const char* layer_name, float* input_data, int inch, int group, int in_h, int in_w,
|
||||
float* output_data, int outch, int out_h, int out_w,
|
||||
int kernel_h, int kernel_w,
|
||||
int stride_h,int stride_w,
|
||||
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w, int activation,
|
||||
float* teg_weight, float* teg_bias, std::string padMode, int nstripes)
|
||||
{
|
||||
node_t conv_node = NULL;
|
||||
|
||||
tensor_t input_tensor = NULL;
|
||||
tensor_t output_tensor = NULL;
|
||||
tensor_t weight_tensor = NULL;
|
||||
tensor_t bias_tensor = NULL;
|
||||
|
||||
/* create graph for convolution */
|
||||
int in_size = in_h * in_w * inch;
|
||||
int out_size = out_h * out_w * outch;
|
||||
int weight_size = outch * (inch / group) * kernel_w * kernel_h;
|
||||
int bias_size = outch;
|
||||
|
||||
int buf_size = 0;
|
||||
int input_num = 0;
|
||||
|
||||
/* create graph */
|
||||
teng_graph_t graph = teng_create_graph(NULL, NULL, NULL);
|
||||
bool ok = true;
|
||||
|
||||
if(graph == NULL)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: create_graph failed." );
|
||||
ok = false;
|
||||
}
|
||||
|
||||
const char* input_name = "data";
|
||||
const char* conv_name = layer_name;
|
||||
|
||||
if (ok && create_input_node(graph, input_name, inch, in_h, in_w) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: create_input_node failed." );
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (ok && create_conv_node(graph, conv_name, input_name, in_h, in_w, out_h, out_w, kernel_h, kernel_w,
|
||||
stride_h, stride_w, pad_h0, pad_h1, pad_w0, pad_w1, inch, outch, group, dilation_h, dilation_w, activation, padMode) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: create conv node failed." );
|
||||
ok = false;
|
||||
}
|
||||
|
||||
/* set input/output node */
|
||||
const char* inputs_name[] = {input_name};
|
||||
const char* outputs_name[] = {conv_name};
|
||||
|
||||
if (ok && teng_set_graph_input_node(graph, inputs_name, sizeof(inputs_name) / sizeof(char*)) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: set inputs failed." );
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (ok && teng_set_graph_output_node(graph, outputs_name, sizeof(outputs_name) / sizeof(char*)) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: set outputs failed." );
|
||||
ok = false;
|
||||
}
|
||||
|
||||
/* set input data */
|
||||
if (ok)
|
||||
{
|
||||
input_tensor = teng_get_graph_input_tensor(graph, 0, 0);
|
||||
buf_size = teng_get_tensor_buffer_size(input_tensor);
|
||||
if (buf_size != in_size * FLOAT_TO_REALSIZE)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: Input data size check failed.");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
teng_set_tensor_buffer(input_tensor, (float *)input_data, buf_size);
|
||||
teng_release_graph_tensor(input_tensor);
|
||||
|
||||
/* create convolution node */
|
||||
/* set weight node */
|
||||
conv_node = teng_get_graph_node(graph, conv_name);
|
||||
weight_tensor = teng_get_node_input_tensor(conv_node, 1);
|
||||
buf_size = teng_get_tensor_buffer_size(weight_tensor);
|
||||
|
||||
if (buf_size != weight_size * FLOAT_TO_REALSIZE)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: Input weight size check failed.");
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
teng_set_tensor_buffer(weight_tensor, teg_weight, buf_size);
|
||||
|
||||
/* set bias node */
|
||||
input_num = teng_get_node_input_number(conv_node);
|
||||
if (input_num > 2)
|
||||
{
|
||||
bias_tensor = teng_get_node_input_tensor(conv_node, 2);
|
||||
buf_size = teng_get_tensor_buffer_size(bias_tensor);
|
||||
if (buf_size != bias_size * FLOAT_TO_REALSIZE)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: Input bias size check failed.");
|
||||
ok = false;
|
||||
}
|
||||
else teng_set_tensor_buffer(bias_tensor, teg_bias, buf_size);
|
||||
}
|
||||
}
|
||||
|
||||
/* prerun */
|
||||
if (ok && teng_prerun_graph_multithread(graph, TENGINE_CLUSTER_BIG, nstripes) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL, "Tengine: prerun_graph failed.");
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
/* set output data */
|
||||
output_tensor = teng_get_node_output_tensor(conv_node, 0);
|
||||
int ret = teng_set_tensor_buffer(output_tensor, output_data, out_size * FLOAT_TO_REALSIZE);
|
||||
if(ret)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: Set output tensor buffer failed." );
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (false == ok)
|
||||
{
|
||||
teng_destroy_graph(graph) ;
|
||||
return NULL ;
|
||||
}
|
||||
return graph;
|
||||
}
|
||||
static bool tengine_init_flag = false;
|
||||
teng_graph_t tengine_init(const char* layer_name, float* input_, int inch, int group, int in_h, int in_w,
|
||||
float *output_, int out_b, int outch, int out_h, int out_w,
|
||||
float *kernel_, int kernel_s ,int kernel_h, int kernel_w,
|
||||
float *teg_bias, int stride_h, int stride_w,
|
||||
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
|
||||
size_t wstep, const std::string padMode, teng_graph_t &graph, int nstripes)
|
||||
{
|
||||
std::vector<float> teg_weight_vec;
|
||||
float *teg_weight = NULL;
|
||||
int kernel_inwh = (inch / group) * kernel_w * kernel_h;
|
||||
// Do not use the activation fuse mode; convolution only.
|
||||
int activation = -1;
|
||||
|
||||
if (!(kernel_s == 2 && kernel_h == kernel_w
|
||||
&& dilation_h == dilation_w && stride_h == stride_w
|
||||
&& out_b == 1 && pad_h0 < 10 && pad_h1 < 10 && pad_w0 < 10 && pad_w1 < 10)) // just for Conv2D
|
||||
{
|
||||
// printf("return : just for Conv2D\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
{
|
||||
/* printf("Tengine(%s): input (1 x %d x %d x %d),output (%d x %d x %d x %d), kernel (%d x %d), stride (%d x %d), dilation (%d x %d), pad (%d x %d).\n",
|
||||
layer_name, inch, in_h, in_w,
|
||||
out_b, outch, out_h, out_w,
|
||||
kernel_w, kernel_h,
|
||||
stride_w, stride_h,
|
||||
dilation_w, dilation_h,
|
||||
pad_h0, pad_h1, pad_w0, pad_w1);
|
||||
*/
|
||||
// weight
|
||||
if (kernel_inwh != wstep)
|
||||
{
|
||||
teg_weight_vec.resize(kernel_inwh * outch);
|
||||
teg_weight = &teg_weight_vec[0];
|
||||
for (int i=0; i<outch; i++)
|
||||
{
|
||||
memcpy(teg_weight+i*kernel_inwh, kernel_+i*wstep, kernel_inwh*FLOAT_TO_REALSIZE);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
teg_weight = kernel_;
|
||||
}
|
||||
|
||||
/* initialize the Tengine resources */
|
||||
if(false == tengine_init_flag)
|
||||
{
|
||||
init_tengine();
|
||||
tengine_init_flag = true;
|
||||
}
|
||||
|
||||
/* create the convolution graph */
|
||||
graph = create_conv_graph(layer_name, input_, inch, group, in_h, in_w,
|
||||
output_, outch, out_h, out_w,
|
||||
kernel_h, kernel_w, stride_h,stride_w,
|
||||
pad_h0, pad_h1, pad_w0, pad_w1, dilation_h, dilation_w, activation,
|
||||
teg_weight, teg_bias, padMode, nstripes);
|
||||
if(NULL == graph )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return graph ;
|
||||
}
|
||||
|
||||
bool tengine_forward(teng_graph_t &graph)
|
||||
{
|
||||
/* run */
|
||||
if(teng_run_graph(graph, 1) < 0)
|
||||
{
|
||||
CV_LOG_WARNING(NULL,"Tengine: run_graph failed.");
|
||||
return false ;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool tengine_release(teng_graph_t &graph)
|
||||
{
|
||||
teng_postrun_graph(graph);
|
||||
teng_destroy_graph(graph);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
@ -194,7 +194,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
|
||||
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
|
||||
float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
|
||||
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN;
|
||||
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
|
||||
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
|
||||
inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
|
||||
expectNoFallbacksFromIE(net);
|
||||
}
|
||||
@ -237,7 +237,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
|
||||
scoreDiff = 0.03;
|
||||
iouDiff = 0.08;
|
||||
}
|
||||
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
|
||||
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
|
||||
inp, "detection_out", "", scoreDiff, iouDiff);
|
||||
expectNoFallbacksFromIE(net);
|
||||
}
|
||||
|
@ -290,8 +290,8 @@ TEST(Reproducibility_SSD, Accuracy)
|
||||
typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
|
||||
TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
|
||||
{
|
||||
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
|
||||
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
|
||||
const string proto = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false);
|
||||
const string model = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false);
|
||||
Net net = readNetFromCaffe(proto, model);
|
||||
int backendId = get<0>(GetParam());
|
||||
int targetId = get<1>(GetParam());
|
||||
@ -731,7 +731,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
#endif
|
||||
|
||||
double scoreDiff = 0.0, iouDiff = 0.0;
|
||||
double scoreDiff = 0.001, iouDiff = 0.03;
|
||||
#if defined(INF_ENGINE_RELEASE)
|
||||
if (target == DNN_TARGET_MYRIAD)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
@ -779,7 +779,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
|
||||
0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
|
||||
0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
|
||||
|
||||
double scoreDiff = 0.0, iouDiff = 0.0;
|
||||
double scoreDiff = 0.003, iouDiff = 0.07;
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
|
||||
scoreDiff = 0.02;
|
||||
iouDiff = 0.13;
|
||||
|
@ -407,15 +407,16 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Fully-connected
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
|
||||
typedef TestWithParam<tuple<int, int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
|
||||
TEST_P(FullyConnected, Accuracy)
|
||||
{
|
||||
int inChannels = get<0>(GetParam());
|
||||
Size inSize = get<1>(GetParam());
|
||||
int outChannels = get<2>(GetParam());
|
||||
bool hasBias = get<3>(GetParam());
|
||||
Backend backendId = get<0>(get<4>(GetParam()));
|
||||
Target targetId = get<1>(get<4>(GetParam()));
|
||||
int batch = get<0>(GetParam());
|
||||
int inChannels = get<1>(GetParam());
|
||||
Size inSize = get<2>(GetParam());
|
||||
int outChannels = get<3>(GetParam());
|
||||
bool hasBias = get<4>(GetParam());
|
||||
Backend backendId = get<0>(get<5>(GetParam()));
|
||||
Target targetId = get<1>(get<5>(GetParam()));
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
|
||||
if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
|
||||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
|
||||
@ -424,6 +425,13 @@ TEST_P(FullyConnected, Accuracy)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
|
||||
}
|
||||
#endif
|
||||
// https://github.com/openvinotoolkit/openvino/issues/19436
|
||||
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16 && batch == 16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2023000000)
|
||||
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL && batch == 16)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL);
|
||||
#endif
|
||||
|
||||
Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
|
||||
randu(weights, -1.0f, 1.0f);
|
||||
@ -439,7 +447,7 @@ TEST_P(FullyConnected, Accuracy)
|
||||
lp.type = "InnerProduct";
|
||||
lp.name = "testLayer";
|
||||
|
||||
int sz[] = {1, inChannels, inSize.height, inSize.width};
|
||||
int sz[] = {batch, inChannels, inSize.height, inSize.width};
|
||||
Mat input(4, &sz[0], CV_32F);
|
||||
|
||||
double l1 = 0.0;
|
||||
@ -453,11 +461,13 @@ TEST_P(FullyConnected, Accuracy)
|
||||
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16)
|
||||
{
|
||||
l1 = 0.01;
|
||||
if (INF_ENGINE_VER_MAJOR_GE(2023000000))
|
||||
lInf = 0.016;
|
||||
}
|
||||
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL)
|
||||
{
|
||||
l1 = 5e-3;
|
||||
lInf = 7e-3;
|
||||
lInf = INF_ENGINE_VER_MAJOR_GE(2023000000) ? 0.016 : 7e-3;
|
||||
}
|
||||
#endif
|
||||
if (targetId == DNN_TARGET_CUDA_FP16)
|
||||
@ -467,6 +477,7 @@ TEST_P(FullyConnected, Accuracy)
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
|
||||
/*batch*/ Values(1, 2, 4, 8, 16),
|
||||
/*in channels*/ Values(3, 4),
|
||||
/*in size*/ Values(Size(5, 4), Size(4, 5), Size(1, 1)),
|
||||
/*out channels*/ Values(3, 4),
|
||||
|
@ -878,14 +878,14 @@ TEST_P(Test_Int8_nets, MobileNet_SSD)
|
||||
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
|
||||
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
|
||||
findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
|
||||
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false),
|
||||
findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false));
|
||||
|
||||
Mat inp = imread(_tf("street.png"));
|
||||
Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
|
||||
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
|
||||
|
||||
float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
|
||||
float confThreshold = FLT_MIN, scoreDiff = 0.084, iouDiff = 0.43;
|
||||
testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
|
||||
}
|
||||
|
||||
|
@ -120,6 +120,28 @@ TEST(blobFromImageWithParams_4ch, letter_box)
|
||||
EXPECT_EQ(0, cvtest::norm(targetBlob, blob, NORM_INF));
|
||||
}
|
||||
|
||||
TEST(blobFromImagesWithParams_4ch, multi_image)
|
||||
{
|
||||
Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
|
||||
Scalar scalefactor(0.1, 0.2, 0.3, 0.4);
|
||||
|
||||
Image2BlobParams param;
|
||||
param.scalefactor = scalefactor;
|
||||
param.datalayout = DNN_LAYOUT_NHWC;
|
||||
|
||||
Mat blobs = blobFromImagesWithParams(std::vector<Mat> { img, 2*img }, param);
|
||||
vector<Range> ranges;
|
||||
ranges.push_back(Range(0, 1));
|
||||
ranges.push_back(Range(0, blobs.size[1]));
|
||||
ranges.push_back(Range(0, blobs.size[2]));
|
||||
ranges.push_back(Range(0, blobs.size[3]));
|
||||
Mat blob0 = blobs(ranges);
|
||||
ranges[0] = Range(1, 2);
|
||||
Mat blob1 = blobs(ranges);
|
||||
|
||||
EXPECT_EQ(0, cvtest::norm(2*blob0, blob1, NORM_INF));
|
||||
}
|
||||
|
||||
TEST(readNet, Regression)
|
||||
{
|
||||
Net net = readNet(findDataFile("dnn/squeezenet_v1.1.prototxt"),
|
||||
|
@ -490,8 +490,8 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
|
||||
refBoxes.emplace_back(left, top, width, height);
|
||||
}
|
||||
|
||||
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
|
||||
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
|
||||
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
|
||||
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
|
||||
|
||||
Scalar mean = Scalar(127.5, 127.5, 127.5);
|
||||
double scale = 1.0 / 127.5;
|
||||
@ -511,7 +511,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
|
||||
}
|
||||
else if (target == DNN_TARGET_CUDA_FP16)
|
||||
{
|
||||
scoreDiff = 0.0021;
|
||||
scoreDiff = 0.0028;
|
||||
iouDiff = 1e-2;
|
||||
}
|
||||
float confThreshold = FLT_MIN;
|
||||
@ -595,8 +595,8 @@ TEST_P(Test_Model, Detection_normalized)
|
||||
std::vector<float> refConfidences = {0.999222f};
|
||||
std::vector<Rect2d> refBoxes = {Rect2d(0, 4, 227, 222)};
|
||||
|
||||
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
|
||||
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
|
||||
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
|
||||
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
|
||||
|
||||
Scalar mean = Scalar(127.5, 127.5, 127.5);
|
||||
double scale = 1.0 / 127.5;
|
||||
|
@ -128,6 +128,11 @@ TEST_P(Test_TFLite, max_unpooling)
|
||||
if (backend == DNN_BACKEND_CUDA)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
|
||||
|
||||
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2022010000)
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
|
||||
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
|
||||
#endif
|
||||
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU) {
|
||||
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
|
||||
if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
|
||||
@ -152,14 +157,7 @@ TEST_P(Test_TFLite, max_unpooling)
|
||||
net.setInput(input);
|
||||
|
||||
std::vector<std::vector<Mat> > outs;
|
||||
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
|
||||
// TODO: seems like a bug with a retrieving intermediate tensors
|
||||
net.forward(outs, {"conv2d_transpose_4", "p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
|
||||
outs.erase(outs.begin());
|
||||
}
|
||||
else {
|
||||
net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
|
||||
}
|
||||
|
||||
ASSERT_EQ(outs.size(), 4);
|
||||
ASSERT_EQ(outs[0].size(), 1);
|
||||
|
135
modules/features2d/3rdparty/mscr/chi_table.h
vendored
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
**
|
||||
** License Agreement
|
||||
** For chi_table.h
|
||||
**
|
||||
** Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
|
||||
**
|
||||
** Redistribution and use in source and binary forms, with or without modification,
|
||||
** are permitted provided that the following conditions are met:
|
||||
**
|
||||
** * Redistribution's of source code must retain the above copyright notice,
|
||||
** this list of conditions and the following disclaimer.
|
||||
**
|
||||
** * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
** this list of conditions and the following disclaimer in the documentation
|
||||
** and/or other materials provided with the distribution.
|
||||
**
|
||||
** * The name of the copyright holders may not be used to endorse or promote products
|
||||
** derived from this software without specific prior written permission.
|
||||
**
|
||||
** This software is provided by the copyright holders and contributors "as is" and
|
||||
** any express or implied warranties, including, but not limited to, the implied
|
||||
** warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
** In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
** indirect, incidental, special, exemplary, or consequential damages
|
||||
** (including, but not limited to, procurement of substitute goods or services;
|
||||
** loss of use, data, or profits; or business interruption) however caused
|
||||
** and on any theory of liability, whether in contract, strict liability,
|
||||
** or tort (including negligence or otherwise) arising in any way out of
|
||||
** the use of this software, even if advised of the possibility of such damage.
|
||||
**
|
||||
** Content origin: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
|
||||
*/
|
||||
#define TABLE_SIZE 400
|
||||
|
||||
static double chitab3[]={0, 0.0150057, 0.0239478, 0.0315227,
|
||||
0.0383427, 0.0446605, 0.0506115, 0.0562786,
|
||||
0.0617174, 0.0669672, 0.0720573, 0.0770099,
|
||||
0.081843, 0.0865705, 0.0912043, 0.0957541,
|
||||
0.100228, 0.104633, 0.108976, 0.113261,
|
||||
0.117493, 0.121676, 0.125814, 0.12991,
|
||||
0.133967, 0.137987, 0.141974, 0.145929,
|
||||
0.149853, 0.15375, 0.15762, 0.161466,
|
||||
0.165287, 0.169087, 0.172866, 0.176625,
|
||||
0.180365, 0.184088, 0.187794, 0.191483,
|
||||
0.195158, 0.198819, 0.202466, 0.2061,
|
||||
0.209722, 0.213332, 0.216932, 0.220521,
|
||||
0.2241, 0.22767, 0.231231, 0.234783,
|
||||
0.238328, 0.241865, 0.245395, 0.248918,
|
||||
0.252435, 0.255947, 0.259452, 0.262952,
|
||||
0.266448, 0.269939, 0.273425, 0.276908,
|
||||
0.280386, 0.283862, 0.287334, 0.290803,
|
||||
0.29427, 0.297734, 0.301197, 0.304657,
|
||||
0.308115, 0.311573, 0.315028, 0.318483,
|
||||
0.321937, 0.32539, 0.328843, 0.332296,
|
||||
0.335749, 0.339201, 0.342654, 0.346108,
|
||||
0.349562, 0.353017, 0.356473, 0.35993,
|
||||
0.363389, 0.366849, 0.37031, 0.373774,
|
||||
0.377239, 0.380706, 0.384176, 0.387648,
|
||||
0.391123, 0.3946, 0.39808, 0.401563,
|
||||
0.405049, 0.408539, 0.412032, 0.415528,
|
||||
0.419028, 0.422531, 0.426039, 0.429551,
|
||||
0.433066, 0.436586, 0.440111, 0.44364,
|
||||
0.447173, 0.450712, 0.454255, 0.457803,
|
||||
0.461356, 0.464915, 0.468479, 0.472049,
|
||||
0.475624, 0.479205, 0.482792, 0.486384,
|
||||
0.489983, 0.493588, 0.4972, 0.500818,
|
||||
0.504442, 0.508073, 0.511711, 0.515356,
|
||||
0.519008, 0.522667, 0.526334, 0.530008,
|
||||
0.533689, 0.537378, 0.541075, 0.54478,
|
||||
0.548492, 0.552213, 0.555942, 0.55968,
|
||||
0.563425, 0.56718, 0.570943, 0.574715,
|
||||
0.578497, 0.582287, 0.586086, 0.589895,
|
||||
0.593713, 0.597541, 0.601379, 0.605227,
|
||||
0.609084, 0.612952, 0.61683, 0.620718,
|
||||
0.624617, 0.628526, 0.632447, 0.636378,
|
||||
0.64032, 0.644274, 0.648239, 0.652215,
|
||||
0.656203, 0.660203, 0.664215, 0.668238,
|
||||
0.672274, 0.676323, 0.680384, 0.684457,
|
||||
0.688543, 0.692643, 0.696755, 0.700881,
|
||||
0.70502, 0.709172, 0.713339, 0.717519,
|
||||
0.721714, 0.725922, 0.730145, 0.734383,
|
||||
0.738636, 0.742903, 0.747185, 0.751483,
|
||||
0.755796, 0.760125, 0.76447, 0.768831,
|
||||
0.773208, 0.777601, 0.782011, 0.786438,
|
||||
0.790882, 0.795343, 0.799821, 0.804318,
|
||||
0.808831, 0.813363, 0.817913, 0.822482,
|
||||
0.827069, 0.831676, 0.836301, 0.840946,
|
||||
0.84561, 0.850295, 0.854999, 0.859724,
|
||||
0.864469, 0.869235, 0.874022, 0.878831,
|
||||
0.883661, 0.888513, 0.893387, 0.898284,
|
||||
0.903204, 0.908146, 0.913112, 0.918101,
|
||||
0.923114, 0.928152, 0.933214, 0.938301,
|
||||
0.943413, 0.94855, 0.953713, 0.958903,
|
||||
0.964119, 0.969361, 0.974631, 0.979929,
|
||||
0.985254, 0.990608, 0.99599, 1.0014,
|
||||
1.00684, 1.01231, 1.01781, 1.02335,
|
||||
1.02891, 1.0345, 1.04013, 1.04579,
|
||||
1.05148, 1.05721, 1.06296, 1.06876,
|
||||
1.07459, 1.08045, 1.08635, 1.09228,
|
||||
1.09826, 1.10427, 1.11032, 1.1164,
|
||||
1.12253, 1.1287, 1.1349, 1.14115,
|
||||
1.14744, 1.15377, 1.16015, 1.16656,
|
||||
1.17303, 1.17954, 1.18609, 1.19269,
|
||||
1.19934, 1.20603, 1.21278, 1.21958,
|
||||
1.22642, 1.23332, 1.24027, 1.24727,
|
||||
1.25433, 1.26144, 1.26861, 1.27584,
|
||||
1.28312, 1.29047, 1.29787, 1.30534,
|
||||
1.31287, 1.32046, 1.32812, 1.33585,
|
||||
1.34364, 1.3515, 1.35943, 1.36744,
|
||||
1.37551, 1.38367, 1.39189, 1.4002,
|
||||
1.40859, 1.41705, 1.42561, 1.43424,
|
||||
1.44296, 1.45177, 1.46068, 1.46967,
|
||||
1.47876, 1.48795, 1.49723, 1.50662,
|
||||
1.51611, 1.52571, 1.53541, 1.54523,
|
||||
1.55517, 1.56522, 1.57539, 1.58568,
|
||||
1.59611, 1.60666, 1.61735, 1.62817,
|
||||
1.63914, 1.65025, 1.66152, 1.67293,
|
||||
1.68451, 1.69625, 1.70815, 1.72023,
|
||||
1.73249, 1.74494, 1.75757, 1.77041,
|
||||
1.78344, 1.79669, 1.81016, 1.82385,
|
||||
1.83777, 1.85194, 1.86635, 1.88103,
|
||||
1.89598, 1.91121, 1.92674, 1.94257,
|
||||
1.95871, 1.97519, 1.99201, 2.0092,
|
||||
2.02676, 2.04471, 2.06309, 2.08189,
|
||||
2.10115, 2.12089, 2.14114, 2.16192,
|
||||
2.18326, 2.2052, 2.22777, 2.25101,
|
||||
2.27496, 2.29966, 2.32518, 2.35156,
|
||||
2.37886, 2.40717, 2.43655, 2.46709,
|
||||
2.49889, 2.53206, 2.56673, 2.60305,
|
||||
2.64117, 2.6813, 2.72367, 2.76854,
|
||||
2.81623, 2.86714, 2.92173, 2.98059,
|
||||
3.04446, 3.1143, 3.19135, 3.27731,
|
||||
3.37455, 3.48653, 3.61862, 3.77982,
|
||||
3.98692, 4.2776, 4.77167, 133.333 };
|
28
modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
License Agreement
|
||||
For chi_table.h
|
||||
|
||||
Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistribution's of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistribution's in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* The name of the copyright holders may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
This software is provided by the copyright holders and contributors "as is" and
|
||||
any express or implied warranties, including, but not limited to, the implied
|
||||
warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
indirect, incidental, special, exemplary, or consequential damages
|
||||
(including, but not limited to, procurement of substitute goods or services;
|
||||
loss of use, data, or profits; or business interruption) however caused
|
||||
and on any theory of liability, whether in contract, strict liability,
|
||||
or tort (including negligence or otherwise) arising in any way out of
|
||||
the use of this software, even if advised of the possibility of such damage.
|
@ -7,3 +7,5 @@ if(DEBUG_opencv_features2d)
|
||||
list(APPEND debug_modules opencv_highgui)
|
||||
endif()
|
||||
ocv_define_module(features2d opencv_imgproc ${debug_modules} OPTIONAL opencv_flann WRAP java objc python js)
|
||||
|
||||
ocv_install_3rdparty_licenses(mscr "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mscr/chi_table_LICENSE.txt")
|
||||
|
@ -30,18 +30,23 @@
|
||||
* OpenCV functions for MSER extraction
|
||||
*
|
||||
* 1. there are two different implementations of MSER, one for gray images, one for color images
|
||||
* 2. the gray image algorithm is taken from: Linear Time Maximally Stable Extremal Regions;
|
||||
* 2. the gray image algorithm is taken from:
|
||||
* Linear Time Maximally Stable Extremal Regions;
|
||||
* the paper claims to be faster than union-find method;
|
||||
* it actually gets 1.5~2m/s on my Centrino L7200 1.2GHz laptop.
|
||||
* 3. the color image algorithm is taken from: Maximally Stable Colour Regions for Recognition and Match;
|
||||
* 3. the color image algorithm is taken from:
|
||||
* Maximally Stable Colour Regions for Recognition and Match;
|
||||
* it should be much slower than gray image method ( 3~4 times );
|
||||
* the chi_table.h file is taken directly from paper's source code which is distributed under permissive BSD-like license: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
|
||||
* the chi_table.h file is taken directly from the paper's source code:
|
||||
* http://users.isy.liu.se/cvl/perfo/software/chi_table.h
|
||||
* license (BSD-like) is located in the file: 3rdparty/mscr/chi_table_LICENSE.txt
|
||||
* 4. though the name is *contours*, the result is actually a list of point sets.
|
||||
*/
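The comment above covers both MSER variants that back cv::MSER: the linear-time gray-level algorithm and the chi_table-driven colour (MSCR) path. As an illustrative aside (not part of this diff; the file name below is a placeholder), a minimal caller looks like:

    #include <opencv2/features2d.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <vector>

    int main()
    {
        // A single-channel image takes the linear-time gray MSER path; a BGR
        // image would go through the MSCR (chi_table) path described above.
        cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);

        cv::Ptr<cv::MSER> mser = cv::MSER::create(/*delta=*/5);
        std::vector<std::vector<cv::Point>> regions;   // each region is a point set, not a contour
        std::vector<cv::Rect> bboxes;
        mser->detectRegions(img, regions, bboxes);
        return 0;
    }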
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "opencv2/imgproc/imgproc_c.h"
|
||||
#include <limits>
|
||||
#include "../3rdparty/mscr/chi_table.h"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
@ -613,113 +618,6 @@ the color MSER has not been completely refactored yet. We leave it mostly as-is,
|
||||
with just enough changes to convert C structures to C++ ones and
|
||||
add support for color images into MSER_Impl::detectAndLabel.
|
||||
*/
|
||||
|
||||
const int TABLE_SIZE = 400;
|
||||
|
||||
static const float chitab3[]=
|
||||
{
|
||||
0.f, 0.0150057f, 0.0239478f, 0.0315227f,
|
||||
0.0383427f, 0.0446605f, 0.0506115f, 0.0562786f,
|
||||
0.0617174f, 0.0669672f, 0.0720573f, 0.0770099f,
|
||||
0.081843f, 0.0865705f, 0.0912043f, 0.0957541f,
|
||||
0.100228f, 0.104633f, 0.108976f, 0.113261f,
|
||||
0.117493f, 0.121676f, 0.125814f, 0.12991f,
|
||||
0.133967f, 0.137987f, 0.141974f, 0.145929f,
|
||||
0.149853f, 0.15375f, 0.15762f, 0.161466f,
|
||||
0.165287f, 0.169087f, 0.172866f, 0.176625f,
|
||||
0.180365f, 0.184088f, 0.187794f, 0.191483f,
|
||||
0.195158f, 0.198819f, 0.202466f, 0.2061f,
|
||||
0.209722f, 0.213332f, 0.216932f, 0.220521f,
|
||||
0.2241f, 0.22767f, 0.231231f, 0.234783f,
|
||||
0.238328f, 0.241865f, 0.245395f, 0.248918f,
|
||||
0.252435f, 0.255947f, 0.259452f, 0.262952f,
|
||||
0.266448f, 0.269939f, 0.273425f, 0.276908f,
|
||||
0.280386f, 0.283862f, 0.287334f, 0.290803f,
|
||||
0.29427f, 0.297734f, 0.301197f, 0.304657f,
|
||||
0.308115f, 0.311573f, 0.315028f, 0.318483f,
|
||||
0.321937f, 0.32539f, 0.328843f, 0.332296f,
|
||||
0.335749f, 0.339201f, 0.342654f, 0.346108f,
|
||||
0.349562f, 0.353017f, 0.356473f, 0.35993f,
|
||||
0.363389f, 0.366849f, 0.37031f, 0.373774f,
|
||||
0.377239f, 0.380706f, 0.384176f, 0.387648f,
|
||||
0.391123f, 0.3946f, 0.39808f, 0.401563f,
|
||||
0.405049f, 0.408539f, 0.412032f, 0.415528f,
|
||||
0.419028f, 0.422531f, 0.426039f, 0.429551f,
|
||||
0.433066f, 0.436586f, 0.440111f, 0.44364f,
|
||||
0.447173f, 0.450712f, 0.454255f, 0.457803f,
|
||||
0.461356f, 0.464915f, 0.468479f, 0.472049f,
|
||||
0.475624f, 0.479205f, 0.482792f, 0.486384f,
|
||||
0.489983f, 0.493588f, 0.4972f, 0.500818f,
|
||||
0.504442f, 0.508073f, 0.511711f, 0.515356f,
|
||||
0.519008f, 0.522667f, 0.526334f, 0.530008f,
|
||||
0.533689f, 0.537378f, 0.541075f, 0.54478f,
|
||||
0.548492f, 0.552213f, 0.555942f, 0.55968f,
|
||||
0.563425f, 0.56718f, 0.570943f, 0.574715f,
|
||||
0.578497f, 0.582287f, 0.586086f, 0.589895f,
|
||||
0.593713f, 0.597541f, 0.601379f, 0.605227f,
|
||||
0.609084f, 0.612952f, 0.61683f, 0.620718f,
|
||||
0.624617f, 0.628526f, 0.632447f, 0.636378f,
|
||||
0.64032f, 0.644274f, 0.648239f, 0.652215f,
|
||||
0.656203f, 0.660203f, 0.664215f, 0.668238f,
|
||||
0.672274f, 0.676323f, 0.680384f, 0.684457f,
|
||||
0.688543f, 0.692643f, 0.696755f, 0.700881f,
|
||||
0.70502f, 0.709172f, 0.713339f, 0.717519f,
|
||||
0.721714f, 0.725922f, 0.730145f, 0.734383f,
|
||||
0.738636f, 0.742903f, 0.747185f, 0.751483f,
|
||||
0.755796f, 0.760125f, 0.76447f, 0.768831f,
|
||||
0.773208f, 0.777601f, 0.782011f, 0.786438f,
|
||||
0.790882f, 0.795343f, 0.799821f, 0.804318f,
|
||||
0.808831f, 0.813363f, 0.817913f, 0.822482f,
|
||||
0.827069f, 0.831676f, 0.836301f, 0.840946f,
|
||||
0.84561f, 0.850295f, 0.854999f, 0.859724f,
|
||||
0.864469f, 0.869235f, 0.874022f, 0.878831f,
|
||||
0.883661f, 0.888513f, 0.893387f, 0.898284f,
|
||||
0.903204f, 0.908146f, 0.913112f, 0.918101f,
|
||||
0.923114f, 0.928152f, 0.933214f, 0.938301f,
|
||||
0.943413f, 0.94855f, 0.953713f, 0.958903f,
|
||||
0.964119f, 0.969361f, 0.974631f, 0.979929f,
|
||||
0.985254f, 0.990608f, 0.99599f, 1.0014f,
|
||||
1.00684f, 1.01231f, 1.01781f, 1.02335f,
|
||||
1.02891f, 1.0345f, 1.04013f, 1.04579f,
|
||||
1.05148f, 1.05721f, 1.06296f, 1.06876f,
|
||||
1.07459f, 1.08045f, 1.08635f, 1.09228f,
|
||||
1.09826f, 1.10427f, 1.11032f, 1.1164f,
|
||||
1.12253f, 1.1287f, 1.1349f, 1.14115f,
|
||||
1.14744f, 1.15377f, 1.16015f, 1.16656f,
|
||||
1.17303f, 1.17954f, 1.18609f, 1.19269f,
|
||||
1.19934f, 1.20603f, 1.21278f, 1.21958f,
|
||||
1.22642f, 1.23332f, 1.24027f, 1.24727f,
|
||||
1.25433f, 1.26144f, 1.26861f, 1.27584f,
|
||||
1.28312f, 1.29047f, 1.29787f, 1.30534f,
|
||||
1.31287f, 1.32046f, 1.32812f, 1.33585f,
|
||||
1.34364f, 1.3515f, 1.35943f, 1.36744f,
|
||||
1.37551f, 1.38367f, 1.39189f, 1.4002f,
|
||||
1.40859f, 1.41705f, 1.42561f, 1.43424f,
|
||||
1.44296f, 1.45177f, 1.46068f, 1.46967f,
|
||||
1.47876f, 1.48795f, 1.49723f, 1.50662f,
|
||||
1.51611f, 1.52571f, 1.53541f, 1.54523f,
|
||||
1.55517f, 1.56522f, 1.57539f, 1.58568f,
|
||||
1.59611f, 1.60666f, 1.61735f, 1.62817f,
|
||||
1.63914f, 1.65025f, 1.66152f, 1.67293f,
|
||||
1.68451f, 1.69625f, 1.70815f, 1.72023f,
|
||||
1.73249f, 1.74494f, 1.75757f, 1.77041f,
|
||||
1.78344f, 1.79669f, 1.81016f, 1.82385f,
|
||||
1.83777f, 1.85194f, 1.86635f, 1.88103f,
|
||||
1.89598f, 1.91121f, 1.92674f, 1.94257f,
|
||||
1.95871f, 1.97519f, 1.99201f, 2.0092f,
|
||||
2.02676f, 2.04471f, 2.06309f, 2.08189f,
|
||||
2.10115f, 2.12089f, 2.14114f, 2.16192f,
|
||||
2.18326f, 2.2052f, 2.22777f, 2.25101f,
|
||||
2.27496f, 2.29966f, 2.32518f, 2.35156f,
|
||||
2.37886f, 2.40717f, 2.43655f, 2.46709f,
|
||||
2.49889f, 2.53206f, 2.56673f, 2.60305f,
|
||||
2.64117f, 2.6813f, 2.72367f, 2.76854f,
|
||||
2.81623f, 2.86714f, 2.92173f, 2.98059f,
|
||||
3.04446f, 3.1143f, 3.19135f, 3.27731f,
|
||||
3.37455f, 3.48653f, 3.61862f, 3.77982f,
|
||||
3.98692f, 4.2776f, 4.77167f, 133.333f
|
||||
};
|
||||
|
||||
struct MSCRNode;
|
||||
|
||||
struct TempMSCR
|
||||
|