From b6cc74f85546f60b5eb5486947e3be121ca54a28 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Tue, 19 Dec 2023 17:35:11 +0800 Subject: [PATCH 01/57] link libacl_dvpp_mpi library when building with CANN backend --- cmake/OpenCVFindCANN.cmake | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmake/OpenCVFindCANN.cmake b/cmake/OpenCVFindCANN.cmake index 913c1887e7..36d160d0f4 100644 --- a/cmake/OpenCVFindCANN.cmake +++ b/cmake/OpenCVFindCANN.cmake @@ -57,6 +57,18 @@ if(CANN_INSTALL_DIR) set(HAVE_CANN OFF) return() endif() + + # * libacl_dvpp_mpi.so + set(libacl_dvpp_mpi "${CANN_INSTALL_DIR}/lib64") + find_library(found_libacldvppmpi NAMES acl_dvpp_mpi PATHS ${libacl_dvpp_mpi} NO_DEFAULT_PATH) + if(found_libacldvppmpi) + set(libacl_dvpp_mpi ${found_libacldvppmpi}) + message(STATUS "CANN: libacl_dvpp_mpi.so is found at ${libacl_dvpp_mpi}") + else() + message(STATUS "CANN: Missing libacl_dvpp_mpi.so. Turning off HAVE_CANN") + set(HAVE_CANN OFF) + return() + endif() # * libgraph.so set(lib_graph "${CANN_INSTALL_DIR}/compiler/lib64") find_library(found_lib_graph NAMES graph PATHS ${lib_graph} NO_DEFAULT_PATH) @@ -105,6 +117,7 @@ if(CANN_INSTALL_DIR) list(APPEND libs_cann ${lib_opsproto}) list(APPEND libs_cann ${lib_graph}) list(APPEND libs_cann ${lib_ge_compiler}) + list(APPEND libs_cann ${libacl_dvpp_mpi}) # * lib_graph_base.so if(NOT CANN_VERSION_BELOW_6_3_ALPHA002) From a30c987f8785e4f674752b8b5425d0e9927ebfad Mon Sep 17 00:00:00 2001 From: llh721113 Date: Mon, 26 Jun 2023 20:37:34 +0800 Subject: [PATCH 02/57] feat: RVP052 Optimization for DNN int8layers --- .../dnn/src/int8layers/convolution_layer.cpp | 13 ++ .../src/int8layers/fully_connected_layer.cpp | 5 + modules/dnn/src/int8layers/layers_common.hpp | 2 + modules/dnn/src/int8layers/layers_rvp052.cpp | 221 ++++++++++++++++++ modules/dnn/src/int8layers/layers_rvp052.hpp | 36 +++ .../linux/riscv64-andes-gcc.toolchain.cmake | 10 + 6 files changed, 287 insertions(+) create mode 100644 
modules/dnn/src/int8layers/layers_rvp052.cpp create mode 100644 modules/dnn/src/int8layers/layers_rvp052.hpp create mode 100755 platforms/linux/riscv64-andes-gcc.toolchain.cmake diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index ba9b31fe35..6121e971a2 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -969,6 +969,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); else + #endif + #if CV_RVP052 + if(isConv2D) + opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else #endif { const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1348,6 +1355,12 @@ public: opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); else + #endif + #if CV_RVP052 + if(isConv2D) + opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else #endif for( int i = 0; i < outCn; i += 2 ) { diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index ba5b0d79c1..3a560ddda6 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -302,6 +302,11 @@ public: if( useLASX ) opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); else + #endif + #if CV_RVP052 + if( 1 ) + opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else #endif { int i = 0; diff --git a/modules/dnn/src/int8layers/layers_common.hpp 
b/modules/dnn/src/int8layers/layers_common.hpp index 5fdafbeab8..4612feed48 100644 --- a/modules/dnn/src/int8layers/layers_common.hpp +++ b/modules/dnn/src/int8layers/layers_common.hpp @@ -13,6 +13,8 @@ #include "int8layers/layers_common.simd_declarations.hpp" #undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#include "./layers_rvp052.hpp" + #ifdef HAVE_OPENCL #include "../ocl4dnn/include/ocl4dnn.hpp" #endif diff --git a/modules/dnn/src/int8layers/layers_rvp052.cpp b/modules/dnn/src/int8layers/layers_rvp052.cpp new file mode 100644 index 0000000000..628882a43f --- /dev/null +++ b/modules/dnn/src/int8layers/layers_rvp052.cpp @@ -0,0 +1,221 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "./layers_rvp052.hpp" + +#if CV_RVP052 + +namespace cv { +namespace dnn { +namespace opt_RVP052 { + +void fastConv(const int8_t *weights, size_t wstep, const int *bias, + const int8_t *rowbuf, int *output, const int *outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float *multiplier, bool initOutput, bool finalOutput) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2] * outShape[3]; + for (int i = 0; i < outCn; i += 2) + { + const int8_t *wptr0 = weights + i * wstep; + const int8_t *wptr1 = wptr0 + wstep; + int *outptr0 = output + i * outPlaneSize; + int *outptr1 = outptr0 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i + 1]; + float mult0 = multiplier[i], mult1 = multiplier[i + 1]; + + if (i + 1 >= outCn) + { + wptr1 = wptr0; + outptr1 = outptr0; + bias1 = bias0; + mult1 = mult0; + } + int j = 0; + for (; j < blockSize; j++) + { + const int8_t *rptr = rowbuf + j * vecsize_aligned; + int s00 = initOutput ? bias0 : outptr0[j]; + int s10 = initOutput ? 
bias1 : outptr1[j]; + + int32x2_t vsx0 = {s00, s10}; + + for (int k = 0; k < vecsize; k += 4) + { + int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)}; + vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr); + } + + if (finalOutput) + { + vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0); + vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1); + vsx0 = __nds__v_sclip32(vsx0, 7); + } + + outptr0[j] = vsx0[0]; + outptr1[j] = vsx0[1]; + } + } +} + +void fastDepthwiseConv(const int8_t *wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int *biasptr, const float *multptr, + const int8_t *inptr_, + int height, int width, + int *outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w); + int bias = biasptr[out_d], biasCopy; + float mult = multptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t *imgptr0 = inptr_ + in_i * width; + const int8_t *imgptr1 = imgptr0 + dilation_h * width; + const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h * (kernel_h - 1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int *outptr = outptr_ + out_i * outW; + if (pad_l > 0) + { + out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 + + (int)imgptr1[0] * w11 + 
(int)imgptr1[dilation_w] * w12 + + (int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 + + biasCopy + inpZp * (w00 + w10 + w20); + outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7); + out_j = 1; + } + + int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0}; + int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0}; + int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0}; + int8x8_t vimgx0, vimgx1, vimgx2; + int32x2_t vout = {0, 0}; + for (; out_j < outW1; out_j+=2) + { + int in_j = out_j * stride_w - pad_l; + vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0, + imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0}; + vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0, + imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0}; + vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0, + imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0}; + + vout = (int32x2_t){biasCopy, biasCopy}; + vout = __nds__v_smaqa(vout, vwx0, vimgx0); + vout = __nds__v_smaqa(vout, vwx1, vimgx1); + vout = __nds__v_smaqa(vout, vwx2, vimgx2); + + outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7); + outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7); + } + + while (out_j > outW1) out_j--; + + for (; out_j < outW; out_j++) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp * (w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp * (w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += 
inpZp * (w02 + w12 + w22); + } + out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 + + (int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 + + (int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy; + outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7); + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 2; i += 2 ) + { + const int8_t* wptr0 = weights + i * wstep; + const int8_t* wptr1 = weights + (i + 1) * wstep; + + int32x2_t vs0 = *(int32x2_t*)(bias + i); + + for( int k = 0; k < vecsize; k += 4 ) + { + int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)}; + vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec); + } + + int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])}; + + vdst = __nds__v_sclip32(vdst + outZp, 7); + + *(int32x2_t*)(dst + i) = vdst; + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i * wstep; + int s0 = bias[i]; + + for( int k = 0; k < vecsize; k += 4 ) + { + int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0}; + s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec); + } + + dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7); + } +} + +}}} // namespace + +#endif diff --git a/modules/dnn/src/int8layers/layers_rvp052.hpp b/modules/dnn/src/int8layers/layers_rvp052.hpp new file mode 100644 index 0000000000..c956caf20c --- /dev/null +++ b/modules/dnn/src/int8layers/layers_rvp052.hpp @@ -0,0 +1,36 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES) +# include +# define CV_RVP052 1 + +namespace cv { +namespace dnn { +namespace opt_RVP052 { + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ); +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp ); +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ); + +}}} + +#else +# define CV_RVP052 0 +#endif diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake new file mode 100755 index 0000000000..ce733fc790 --- /dev/null +++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake @@ -0,0 +1,10 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory") + +set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc) +set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp") From cd005752577bb9c5a2c4a4f60a2fb416b7520de5 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Fri, 22 Dec 2023 18:24:23 +0900 Subject: [PATCH 03/57] brush up --- 
cmake/OpenCVCompilerOptimizations.cmake | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index ea577e4a1c..ff0e40c666 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -484,21 +484,19 @@ macro(ocv_check_compiler_optimization OPT) endmacro() macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION) - if(NOT MSVC) - unset(_POSTFIX) - # Check each feature option - foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) - string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) - if(NOT ${OPT_FOUND} EQUAL -1) - string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") - string(APPEND _POSTFIX "${TRAILING_PART}") - string(REPLACE " ${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) - endif() - endforeach() - # If more than one option found, merge them - if(NOT "x${_POSTFIX}" STREQUAL "x") - set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") + unset(_POSTFIX) + # Check each feature option + foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) + string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) + if(NOT ${OPT_FOUND} EQUAL -1) + string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") + string(APPEND _POSTFIX "${TRAILING_PART}") + string(REPLACE " ${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) endif() + endforeach() + # If more than one option found, merge them + if(NOT "x${_POSTFIX}" STREQUAL "x") + set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") endif() endmacro() @@ -596,10 +594,12 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) endforeach() if(AARCH64) + if(NOT MSVC) # Define the list of NEON options to check set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16) set(BASE_ARCHITECTURE "-march=armv8.2-a") 
ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE}) + endif() endif() foreach(OPT ${CPU_BASELINE_REQUIRE}) From 95b84a0a9bf63d2a719f8440ff6bc1940369513e Mon Sep 17 00:00:00 2001 From: Yusuke Kameda Date: Fri, 22 Dec 2023 21:22:52 +0900 Subject: [PATCH 04/57] Update applyColorMap document --- modules/imgproc/include/opencv2/imgproc.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index c95f1ada1d..88dd2e7f56 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -4470,7 +4470,7 @@ An example using applyColorMap function /** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. @param dst The result is the colormapped source image. Note: Mat::create is called on dst. @param colormap The colormap to apply, see #ColormapTypes */ @@ -4478,8 +4478,8 @@ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap); /** @brief Applies a user colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. -@param dst The result is the colormapped source image. Note: Mat::create is called on dst. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. +@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst. 
@param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256 */ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor); From 0f1c484ea9974f9236a66bf9d9799a35f0c1191d Mon Sep 17 00:00:00 2001 From: Letu Ren Date: Wed, 27 Dec 2023 22:26:36 +0800 Subject: [PATCH 05/57] Remove deprecated CMake variable CMake variable `CMAKE_COMPILER_IS_CCACHE` is marked as deprecated and "# FIXIT Avoid setting of CMAKE_ variables". It is introduced in https://github.com/opencv/opencv/pull/11167 which is before the release of 4.0. Since it's deprecated almost six years ago, I think it's OK to remove them. Related: https://github.com/opencv/opencv/pull/24771 --- cmake/OpenCVCompilerOptions.cmake | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 427189c079..c3715fafb5 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -1,13 +1,6 @@ if("${CMAKE_CXX_COMPILER};${CMAKE_C_COMPILER};${CMAKE_CXX_COMPILER_LAUNCHER}" MATCHES "ccache") - set(CMAKE_COMPILER_IS_CCACHE 1) # TODO: FIXIT Avoid setting of CMAKE_ variables set(OPENCV_COMPILER_IS_CCACHE 1) endif() -function(access_CMAKE_COMPILER_IS_CCACHE) - if(NOT OPENCV_SUPPRESS_DEPRECATIONS) - message(WARNING "DEPRECATED: CMAKE_COMPILER_IS_CCACHE is replaced to OPENCV_COMPILER_IS_CCACHE.") - endif() -endfunction() -variable_watch(CMAKE_COMPILER_IS_CCACHE access_CMAKE_COMPILER_IS_CCACHE) if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE) # This works fine with Unix Makefiles and Ninja generators find_host_program(CCACHE_PROGRAM ccache) From e5d1309fcfd8d60371883417f7401e59a085ddfd Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 28 Dec 2023 16:42:01 +0300 Subject: [PATCH 06/57] Fixed typo in Android build scripts. 
--- samples/android/15-puzzle/build.gradle.in | 2 +- samples/android/camera-calibration/build.gradle.in | 2 +- samples/android/color-blob-detection/build.gradle.in | 2 +- samples/android/face-detection/build.gradle.in | 2 +- samples/android/image-manipulations/build.gradle.in | 2 +- samples/android/mobilenet-objdetect/build.gradle.in | 2 +- samples/android/qr-detection/build.gradle.in | 2 +- samples/android/tutorial-1-camerapreview/build.gradle.in | 2 +- samples/android/tutorial-2-mixedprocessing/build.gradle.in | 4 ++-- samples/android/tutorial-3-cameracontrol/build.gradle.in | 2 +- samples/android/tutorial-4-opencl/build.gradle.in | 4 ++-- samples/android/video-recorder/build.gradle.in | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/samples/android/15-puzzle/build.gradle.in b/samples/android/15-puzzle/build.gradle.in index bf8921b98d..b2a8975dce 100644 --- a/samples/android/15-puzzle/build.gradle.in +++ b/samples/android/15-puzzle/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == 'sdk_path') { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == 'maven_local' || gradle.opencv_source == 'maven_cenral') { + } else if (gradle.opencv_source == 'maven_local' || gradle.opencv_source == 'maven_central') { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/camera-calibration/build.gradle.in b/samples/android/camera-calibration/build.gradle.in index d79df7777e..3cd3d9200e 100644 --- a/samples/android/camera-calibration/build.gradle.in +++ b/samples/android/camera-calibration/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source 
== "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/color-blob-detection/build.gradle.in b/samples/android/color-blob-detection/build.gradle.in index 6d544592a4..b0c50859f9 100644 --- a/samples/android/color-blob-detection/build.gradle.in +++ b/samples/android/color-blob-detection/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/face-detection/build.gradle.in b/samples/android/face-detection/build.gradle.in index 6cc9d8cfb4..6fc4ce26c7 100644 --- a/samples/android/face-detection/build.gradle.in +++ b/samples/android/face-detection/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/image-manipulations/build.gradle.in b/samples/android/image-manipulations/build.gradle.in index a227d548cf..b12701f377 100644 --- a/samples/android/image-manipulations/build.gradle.in +++ b/samples/android/image-manipulations/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || 
gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/mobilenet-objdetect/build.gradle.in b/samples/android/mobilenet-objdetect/build.gradle.in index 4cb1789e19..377d151d1b 100644 --- a/samples/android/mobilenet-objdetect/build.gradle.in +++ b/samples/android/mobilenet-objdetect/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/qr-detection/build.gradle.in b/samples/android/qr-detection/build.gradle.in index 274f0b4129..469dea609c 100644 --- a/samples/android/qr-detection/build.gradle.in +++ b/samples/android/qr-detection/build.gradle.in @@ -30,7 +30,7 @@ dependencies { //implementation fileTree(dir: 'libs', include: ['*.jar']) if (gradle.opencv_source == "sdk_path") { implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } } diff --git a/samples/android/tutorial-1-camerapreview/build.gradle.in b/samples/android/tutorial-1-camerapreview/build.gradle.in index a0d44eaf9a..deb9cf1b03 100644 --- a/samples/android/tutorial-1-camerapreview/build.gradle.in +++ b/samples/android/tutorial-1-camerapreview/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV 
from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/tutorial-2-mixedprocessing/build.gradle.in b/samples/android/tutorial-2-mixedprocessing/build.gradle.in index 4125d65a38..e0f0a6b3c7 100644 --- a/samples/android/tutorial-2-mixedprocessing/build.gradle.in +++ b/samples/android/tutorial-2-mixedprocessing/build.gradle.in @@ -44,7 +44,7 @@ android { } } buildFeatures { - if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { prefab true } } @@ -55,7 +55,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/tutorial-3-cameracontrol/build.gradle.in b/samples/android/tutorial-3-cameracontrol/build.gradle.in index b7dffe86c5..d83f37d74e 100644 --- a/samples/android/tutorial-3-cameracontrol/build.gradle.in +++ b/samples/android/tutorial-3-cameracontrol/build.gradle.in @@ -31,7 +31,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 
'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/tutorial-4-opencl/build.gradle.in b/samples/android/tutorial-4-opencl/build.gradle.in index 4675156df2..d9c8cda60e 100644 --- a/samples/android/tutorial-4-opencl/build.gradle.in +++ b/samples/android/tutorial-4-opencl/build.gradle.in @@ -46,7 +46,7 @@ android { } } buildFeatures { - if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { prefab true } } @@ -57,7 +57,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/video-recorder/build.gradle.in b/samples/android/video-recorder/build.gradle.in index b279623803..506d98ce44 100644 --- a/samples/android/video-recorder/build.gradle.in +++ b/samples/android/video-recorder/build.gradle.in @@ -30,7 +30,7 @@ dependencies { //implementation fileTree(dir: 'libs', include: ['*.jar']) if (gradle.opencv_source == "sdk_path") { implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } } From 6e3685d7338e2826caf56f24c9eeba1187785ed5 Mon Sep 17 00:00:00 2001 From: Letu Ren Date: Fri, 29 Dec 2023 00:16:52 +0800 Subject: [PATCH 07/57] Fix avif version detection Currently, even though libavif is found, the version of libavif will not be printed. 
`find_package(libavif QUIET)` will set `libavif_VERSION` according to cmake documentation https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-version-selection This patch has been tested on ArchLinux with libavif 1.0.3 installed. --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5da9c2a695..d0b9c7d194 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1452,8 +1452,8 @@ if(WITH_WEBP OR HAVE_WEBP) endif() if(WITH_AVIF OR HAVE_AVIF) - if(AVIF_VERSION) - status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${AVIF_VERSION})" ELSE "NO") + if(libavif_VERSION) + status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${libavif_VERSION})" ELSE "NO") else() status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY}" ELSE "NO") endif() From 3b26e183cbc1c85c43473656794f2df9578c64b6 Mon Sep 17 00:00:00 2001 From: Abduragim Shtanchaev Date: Wed, 27 Dec 2023 19:05:24 +0300 Subject: [PATCH 08/57] changed weights of yolov7 --- modules/dnn/test/test_onnx_importer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 080b07243c..457b151ccf 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -2797,7 +2797,7 @@ TEST_P(Test_ONNX_nets, YOLOv7) CV_TEST_TAG_DEBUG_VERYLONG ); - std::string weightPath = _tf("models/yolov7_not_simplified.onnx", false); + std::string weightPath = _tf("models/yolov7.onnx", false); // Reference, which is collected with input size of 640x640 std::vector refClassIds{1, 16, 7}; std::vector refScores{0.9614331f, 0.9589417f, 0.8679074f}; From 46b3a504cf5d1f3219f0f7cbfeafe3abcc29eff5 Mon Sep 17 00:00:00 2001 From: Kumataro Date: Fri, 29 Dec 2023 12:58:24 +0900 Subject: [PATCH 09/57] imgcodecs: suppress warning at test_avif.cpp --- modules/imgcodecs/test/test_avif.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/modules/imgcodecs/test/test_avif.cpp b/modules/imgcodecs/test/test_avif.cpp index 99b8f7769c..72b7f54fea 100644 --- a/modules/imgcodecs/test/test_avif.cpp +++ b/modules/imgcodecs/test/test_avif.cpp @@ -166,7 +166,7 @@ TEST_P(Imgcodecs_Avif_Image_EncodeDecodeSuite, imencode_imdecode) { cv::Exception); return; } - bool result; + bool result = true; EXPECT_NO_THROW( result = cv::imencode(".avif", img_original, buf, encoding_params_);); EXPECT_TRUE(result); From 21b6f06f9da43bd484bcaa66b3b4680d009edbee Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Fri, 29 Dec 2023 09:55:25 +0100 Subject: [PATCH 10/57] Use normal slash in #include --- modules/videoio/src/cap_winrt_bridge.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/videoio/src/cap_winrt_bridge.hpp b/modules/videoio/src/cap_winrt_bridge.hpp index a1e134e6ab..b78f8544bb 100644 --- a/modules/videoio/src/cap_winrt_bridge.hpp +++ b/modules/videoio/src/cap_winrt_bridge.hpp @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include @@ -114,4 +114,4 @@ private: cv::Mat backInputMat; int deviceIndex, width, height; -}; \ No newline at end of file +}; From 7f2c14fc4ff276193682baea80d0640c5c7d707a Mon Sep 17 00:00:00 2001 From: Rageking8 <106309953+Rageking8@users.noreply.github.com> Date: Fri, 29 Dec 2023 21:34:16 +0800 Subject: [PATCH 11/57] Correct invalid error directive --- modules/core/include/opencv2/core/quaternion.inl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/include/opencv2/core/quaternion.inl.hpp b/modules/core/include/opencv2/core/quaternion.inl.hpp index b901ecbc68..4204806a82 100644 --- a/modules/core/include/opencv2/core/quaternion.inl.hpp +++ b/modules/core/include/opencv2/core/quaternion.inl.hpp @@ -28,7 +28,7 @@ #define OPENCV_CORE_QUATERNION_INL_HPP #ifndef OPENCV_CORE_QUATERNION_HPP -#erorr This is not a standalone header. Include quaternion.hpp instead. +#error This is not a standalone header. 
Include quaternion.hpp instead. #endif //@cond IGNORE From 10f6cbf41c61d11abc93ea19381c60ad3ced14da Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Fri, 29 Dec 2023 16:57:17 +0200 Subject: [PATCH 12/57] Fix 24789 --- cmake/OpenCVDetectCUDA.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index e0c539b90a..06998400d7 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -136,11 +136,11 @@ macro(ocv_check_windows_crt_linkage) cmake_policy(GET CMP0091 MSVC_RUNTIME_SET_BY_ABSTRACTION) if(MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW") if(NOT BUILD_SHARED_LIBS AND BUILD_WITH_STATIC_CRT) - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MT") - set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") else() - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MD") - set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd") endif() endif() endif() From a25cfb463d272fc1ea6941d1530b2df13944d270 Mon Sep 17 00:00:00 2001 From: Ahmed Essam Date: Mon, 1 Jan 2024 00:27:09 +0200 Subject: [PATCH 13/57] Better error message for missing gstreamer plugin --- modules/videoio/src/cap_gstreamer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp index cdaccabe45..41e98794b9 100644 --- a/modules/videoio/src/cap_gstreamer.cpp +++ b/modules/videoio/src/cap_gstreamer.cpp @@ -2820,7 +2820,8 @@ void handleMessage(GstElement * pipeline) if (gst_is_missing_plugin_message(msg)) { - CV_WARN("your GStreamer installation is missing a required plugin"); + CV_WARN("your 
GStreamer installation is missing a required plugin: " << + gst_missing_plugin_message_get_description(msg)); } else { From 19527d79d636789a92dc2649d0059edf23a06c52 Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Tue, 2 Jan 2024 08:33:55 +0200 Subject: [PATCH 14/57] core: address clang warnings --- modules/core/include/opencv2/core/cuda/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/include/opencv2/core/cuda/common.hpp b/modules/core/include/opencv2/core/cuda/common.hpp index 134809678d..1e1d5de1b0 100644 --- a/modules/core/include/opencv2/core/cuda/common.hpp +++ b/modules/core/include/opencv2/core/cuda/common.hpp @@ -99,7 +99,7 @@ namespace cv { namespace cuda } #if (CUDART_VERSION >= 12000) - template inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz& img, const cudaTextureDesc& texDesc) { + template inline void createTextureObjectPitch2D(cudaTextureObject_t*, PtrStepSz&, const cudaTextureDesc&) { CV_Error(cv::Error::GpuNotSupported, "Function removed in CUDA SDK 12"); } #else //TODO: remove from OpenCV 5.x From 14e0d4355011a3f89d9fd057ccd756bac95c27d8 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Tue, 2 Jan 2024 22:18:35 +0100 Subject: [PATCH 15/57] Fix aruco detector thread safety. Concurrently writing to a vector is not thread-safe. 
--- modules/objdetect/src/aruco/aruco_detector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp index 97a40746a8..031d4849b3 100644 --- a/modules/objdetect/src/aruco/aruco_detector.cpp +++ b/modules/objdetect/src/aruco/aruco_detector.cpp @@ -780,7 +780,7 @@ struct ArucoDetector::ArucoDetectorImpl { vector idsTmp(ncandidates, -1); vector rotated(ncandidates, 0); vector validCandidates(ncandidates, 0); - vector was(ncandidates, false); + vector was(ncandidates, false); bool checkCloseContours = true; int maxDepth = 0; From 81865b3d522576d38c49f8bca7729a7501e7b8b1 Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Sun, 31 Dec 2023 09:12:11 +0200 Subject: [PATCH 16/57] videoio: fix incorrect timestamps returned by VideoCapture when using raw video parsing mode --- modules/videoio/src/cap_ffmpeg_impl.hpp | 7 ++- modules/videoio/test/test_ffmpeg.cpp | 60 +++++++++++++++++++++++++ modules/videoio/test/test_precomp.hpp | 1 + 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp index 3dcd4e81d5..9be4b08279 100644 --- a/modules/videoio/src/cap_ffmpeg_impl.hpp +++ b/modules/videoio/src/cap_ffmpeg_impl.hpp @@ -1587,8 +1587,11 @@ bool CvCapture_FFMPEG::grabFrame() if (picture_pts == AV_NOPTS_VALUE_) { if (!rawMode) picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts; - else - picture_pts = packet.pts != AV_NOPTS_VALUE_ && packet.pts != 0 ? packet.pts : packet.dts; + else { + const AVPacket& packet_raw = packet.data != 0 ? packet : packet_filtered; + picture_pts = packet_raw.pts != AV_NOPTS_VALUE_ && packet_raw.pts != 0 ? 
packet_raw.pts : packet_raw.dts; + if (picture_pts < 0) picture_pts = 0; + } frame_number++; } } diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp index 7e09d61729..88f0c8f4bd 100644 --- a/modules/videoio/test/test_ffmpeg.cpp +++ b/modules/videoio/test/test_ffmpeg.cpp @@ -235,6 +235,66 @@ const videoio_container_params_t videoio_container_params[] = INSTANTIATE_TEST_CASE_P(/**/, videoio_container, testing::ValuesIn(videoio_container_params)); +typedef tuple videoio_container_get_params_t; +typedef testing::TestWithParam videoio_container_get; + +TEST_P(videoio_container_get, read) +{ + const VideoCaptureAPIs api = get<0>(GetParam()); + + if (!videoio_registry::hasBackend(api)) + throw SkipTestException("Backend was not found"); + + const string fileName = get<1>(GetParam()); + const int height = get<2>(GetParam()); + const int width = get<3>(GetParam()); + const int nFrames = get<4>(GetParam()); + const int bitrate = get<5>(GetParam()); + const int fps = get<6>(GetParam()); + + VideoCapture container(findDataFile(fileName), api, { CAP_PROP_FORMAT, -1 }); + if (!container.isOpened()) + throw SkipTestException("Video stream is not supported"); + + const int heightProp = static_cast(container.get(CAP_PROP_FRAME_HEIGHT)); + ASSERT_EQ(height, heightProp); + const int widthProp = static_cast(container.get(CAP_PROP_FRAME_WIDTH)); + ASSERT_EQ(width, widthProp); + const int nFramesProp = static_cast(container.get(CAP_PROP_FRAME_COUNT)); + ASSERT_EQ(nFrames, nFramesProp); + const int bitrateProp = static_cast(container.get(CAP_PROP_BITRATE)); + ASSERT_EQ(bitrate, bitrateProp); + const double fpsProp = container.get(CAP_PROP_FPS); + ASSERT_EQ(fps, fpsProp); + // remove when PR fixing raw video CAP_PROP_POS_MSEC return value is merged and windows dll is updated +#ifndef _WIN32 + vector displayTimeMs; + int iFrame = 1; + while (container.grab()) { + displayTimeMs.push_back(static_cast(container.get(CAP_PROP_POS_MSEC))); + const int 
iFrameProp = static_cast(container.get(CAP_PROP_POS_FRAMES)); + ASSERT_EQ(iFrame++, iFrameProp); + } + sort(displayTimeMs.begin(), displayTimeMs.end()); + vector displayTimeDiffMs(displayTimeMs.size()); + std::adjacent_difference(displayTimeMs.begin(), displayTimeMs.end(), displayTimeDiffMs.begin()); + auto minTimeMsIt = min_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end()); + auto maxTimeMsIt = max_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end()); + const int frameTimeMs = static_cast(1000.0 / fps); + ASSERT_NEAR(frameTimeMs, *minTimeMsIt, 1); + ASSERT_NEAR(frameTimeMs, *maxTimeMsIt, 1); +#endif +} + +const videoio_container_get_params_t videoio_container_get_params[] = +{ + videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mp4", 384, 672, 125, 483, 24), + videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mjpg.avi", 384, 672, 125, 2713, 24), + videoio_container_get_params_t(CAP_FFMPEG, "video/sample_322x242_15frames.yuv420p.libx264.mp4", 242, 322, 15, 542, 25) +}; + +INSTANTIATE_TEST_CASE_P(/**/, videoio_container_get, testing::ValuesIn(videoio_container_get_params)); + typedef tuple videoio_encapsulate_params_t; typedef testing::TestWithParam< videoio_encapsulate_params_t > videoio_encapsulate; diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp index 61ecec609d..835177729b 100644 --- a/modules/videoio/test/test_precomp.hpp +++ b/modules/videoio/test/test_precomp.hpp @@ -6,6 +6,7 @@ #include #include +#include #include "opencv2/ts.hpp" #include "opencv2/ts/ocl_test.hpp" From 63cde0b90d378337cdf280e7e3d129b38b030439 Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Fri, 5 Jan 2024 17:24:09 +0800 Subject: [PATCH 17/57] multi-threaded scatter and refactor perf --- modules/dnn/perf/perf_layer.cpp | 123 +++++++++++------------ modules/dnn/src/layers/scatter_layer.cpp | 100 +++++++++--------- 2 files changed, 111 insertions(+), 112 deletions(-) diff --git 
a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 66b5ad62c2..04e7d04153 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -258,76 +258,71 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16) test_slice<4>(inputShape, begin, end); } -struct Layer_Scatter : public TestBaseWithParam > -{ - void test_layer(const std::vector& shape, const String reduction = "none", int axis = 0) +using Layer_Scatter = TestBaseWithParam, std::string, int, tuple>>; +PERF_TEST_P_(Layer_Scatter, scatter) { + std::vector shape = get<0>(GetParam()); + std::string reduction = get<1>(GetParam()); + int axis = get<2>(GetParam()); + int backend_id = get<0>(get<3>(GetParam())); + int target_id = get<1>(get<3>(GetParam())); + + Mat data(shape, CV_32FC1); + Mat indices(shape, CV_32FC1); + Mat updates(shape, CV_32FC1); + + randn(data, 0.f, 1.f); + randu(indices, 0, shape[axis]); + randn(updates, 0.f, 1.f); + + indices.convertTo(indices, CV_32SC1, 1, -1); + + Net net; + LayerParams lp; + lp.type = "Scatter"; + lp.name = "testLayer"; + lp.set("reduction", reduction); + lp.set("axis", axis); + + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup { - int backendId = get<0>(GetParam()); - int targetId = get<1>(GetParam()); + std::vector input_names{"data", "indices", "updates"}; + net.setInputsNames(input_names); + net.setInput(data, input_names[0]); + net.setInput(indices, input_names[1]); + net.setInput(updates, input_names[2]); - Mat data(shape, CV_32FC1); - Mat indices(shape, CV_32FC1); - Mat updates(shape, CV_32FC1); - - Scalar mean = 0.f; - Scalar std = 1.f; - randn(data, mean, std); - randu(indices, 0, shape[axis]); - randn(updates, mean, std); - - indices.convertTo(indices, CV_32SC1, 1, -1); - - Net net; - LayerParams lp; - lp.type = "Scatter"; - lp.name = "testLayer"; - lp.set("reduction", reduction); - lp.set("axis", axis); - - int id = 
net.addLayerToPrev(lp.name, lp.type, lp); - net.connect(0, 0, id, 0); - net.connect(0, 1, id, 1); - net.connect(0, 2, id, 2); - - // warmup - { - std::vector inpNames(3); - inpNames[0] = "data"; - inpNames[1] = "indices"; - inpNames[2] = "updates"; - net.setInputsNames(inpNames); - net.setInput(data, inpNames[0]); - net.setInput(indices, inpNames[1]); - net.setInput(updates, inpNames[2]); - - net.setPreferableBackend(backendId); - net.setPreferableTarget(targetId); - Mat out = net.forward(); - } - - TEST_CYCLE() - { - Mat res = net.forward(); - } - - SANITY_CHECK_NOTHING(); + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + Mat out = net.forward(); } - int N = 8; - int C = 256; - int H = 128; - int W = 100; -}; + // perf + TEST_CYCLE() + { + Mat res = net.forward(); + } -PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter) -{ - test_layer({N, C, H, W}); + SANITY_CHECK_NOTHING(); } -PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add) -{ - test_layer({N, C, H, W}, "add"); -} +INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine( + Values(std::vector{2, 128, 64, 50}), + Values(std::string("none"), std::string("add")), + Values(0), // use Values(0, 1, 2, 3) for more details + dnnBackendsAndTargets(/* withInferenceEngine= */ false, + /* withHalide= */ false, + /* withCpuOCV= */ true, + /* withVkCom= */ false, + /* withCUDA= */ false, + /* withNgraph= */ false, + /* withWebnn= */ false, + /* withCann= */ false) // only test on CPU +)); struct Layer_ScatterND : public TestBaseWithParam > { @@ -800,7 +795,7 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple #ifdef HAVE_CUDA INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA))); #endif -INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); +// INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, 
DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp index 084eecb03c..3e0ee2affb 100644 --- a/modules/dnn/src/layers/scatter_layer.cpp +++ b/modules/dnn/src/layers/scatter_layer.cpp @@ -81,59 +81,63 @@ public: } template - void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) + void forward_impl(const Functor& reduce_operation, const Mat& input_mat, const Mat& indices_mat, const Mat& updates_mat, Mat& output_mat) { - data.copyTo(out); + input_mat.copyTo(output_mat); - const int ndims = data.dims; - const int* shape = data.size.p; - const size_t* step = data.step.p; + const int ndims = input_mat.dims; + const auto &input_mat_shape = shape(input_mat); + std::vector input_mat_step(ndims); - const int* ind_shape = indices.size.p; - const size_t* ind_step = indices.step.p; + const auto &indices_mat_shape = shape(indices_mat); + // const auto &indices_mat_step = indices_mat.step; + std::vector indices_mat_step(ndims); - size_t inp_offset = 0; - size_t ind_offset = 0; - const T* p_index = indices.ptr(); - const T* p_update = updates.ptr(); - T* p_out = out.ptr(); - - size_t total = indices.total(); - - int j, offset_at_idx, index; - size_t t, idx; - for (size_t i = 0; i < total; i++) - { - t = i; - inp_offset = 0; - ind_offset = 0; - int offset_at_axis = 0; - for (j = ndims - 1; j >= 0; j--) - { - idx = t / ind_shape[j]; - offset_at_idx = (int)(t - idx * ind_shape[j]); - ind_offset += offset_at_idx * ind_step[j]; - inp_offset += offset_at_idx * step[j]; - t = idx; - if (j == axis) - { - offset_at_axis = 
offset_at_idx * step[j]; - } - } - ind_offset /= sizeof(T); - - // get index and overwrite current indices - const T* tmp_p_index = p_index + ind_offset; - index = (int)(*tmp_p_index); - CV_Assert(index < shape[axis] && index > -shape[axis]); - - inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis]; - inp_offset /= sizeof(T); - - const T* tmp_p_update = p_update + ind_offset; - T* tmp_p_out = p_out + inp_offset; - *tmp_p_out = rd(*tmp_p_out, *tmp_p_update); + for (int i = 0; i < ndims; i++) { + input_mat_step[i] = static_cast(input_mat.step.p[i] / sizeof(T)); + indices_mat_step[i] = static_cast(indices_mat.step.p[i] / sizeof(T)); } + + const T* indices = indices_mat.ptr(); + const T* updates = updates_mat.ptr(); + T* output = output_mat.ptr(); + + auto fn = [&](const Range &r) { + size_t input_offset = 0, indices_offset = 0; + + int indices_index, index; + size_t axis_offset, tmp_index, j_index; + for (int i = r.start; i < r.end; i++) { + input_offset = 0; + indices_offset = 0; + indices_index = i; + axis_offset = 0; + for (int j = ndims - 1; j >= 0; j--) { + tmp_index = indices_index / indices_mat_shape[j]; + j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]); + input_offset += j_index * input_mat_step[j]; + indices_offset += j_index * indices_mat_step[j]; + indices_index = tmp_index; + if (j == axis) { + axis_offset = j_index * input_mat_step[j]; + } + } + + // get index and overwrite current indices + index = static_cast(*(indices + indices_offset)); + index = (index + input_mat_shape[axis]) % input_mat_shape[axis]; + CV_Assert(index < input_mat_shape[axis] && index >= 0); + input_offset = input_offset - axis_offset + index * input_mat_step[axis]; + + const T* update = updates + indices_offset; + T* y = output + input_offset; + *y = reduce_operation(*y, *update); + } + }; + + size_t total = indices_mat.total(); + double nstripes = (size_t)total * ndims * (1 / 1024.0); + parallel_for_(Range(0, total), fn, 
nstripes); } template From 2997b4c5fe0c00493d61530dae22280b58390d4a Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Fri, 5 Jan 2024 18:15:27 +0800 Subject: [PATCH 18/57] pretty format --- modules/dnn/src/layers/scatter_layer.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp index 3e0ee2affb..3b803b16c1 100644 --- a/modules/dnn/src/layers/scatter_layer.cpp +++ b/modules/dnn/src/layers/scatter_layer.cpp @@ -81,16 +81,15 @@ public: } template - void forward_impl(const Functor& reduce_operation, const Mat& input_mat, const Mat& indices_mat, const Mat& updates_mat, Mat& output_mat) - { + void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) { input_mat.copyTo(output_mat); const int ndims = input_mat.dims; + const auto &input_mat_shape = shape(input_mat); std::vector input_mat_step(ndims); const auto &indices_mat_shape = shape(indices_mat); - // const auto &indices_mat_step = indices_mat.step; std::vector indices_mat_step(ndims); for (int i = 0; i < ndims; i++) { @@ -98,16 +97,16 @@ public: indices_mat_step[i] = static_cast(indices_mat.step.p[i] / sizeof(T)); } - const T* indices = indices_mat.ptr(); - const T* updates = updates_mat.ptr(); - T* output = output_mat.ptr(); - auto fn = [&](const Range &r) { size_t input_offset = 0, indices_offset = 0; int indices_index, index; size_t axis_offset, tmp_index, j_index; for (int i = r.start; i < r.end; i++) { + const T* indices = indices_mat.ptr(); + const T* updates = updates_mat.ptr(); + T* output = output_mat.ptr(); + input_offset = 0; indices_offset = 0; indices_index = i; @@ -129,9 +128,9 @@ public: CV_Assert(index < input_mat_shape[axis] && index >= 0); input_offset = input_offset - axis_offset + index * input_mat_step[axis]; - const T* update = updates + indices_offset; - T* y = output + input_offset; - *y = 
reduce_operation(*y, *update); + updates += indices_offset; + output += input_offset; + *output = reduce_operation(*output, *updates); } }; From 2ed97b9ef3f096031b55e44dff254533dc14afa2 Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Fri, 5 Jan 2024 18:15:59 +0800 Subject: [PATCH 19/57] multi-threaded scatterND and refactor perf --- modules/dnn/perf/perf_layer.cpp | 168 ++++++++++----------- modules/dnn/src/layers/scatterND_layer.cpp | 78 +++++----- 2 files changed, 123 insertions(+), 123 deletions(-) diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 04e7d04153..3e477da125 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -324,103 +324,95 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine( /* withCann= */ false) // only test on CPU )); -struct Layer_ScatterND : public TestBaseWithParam > -{ - void test_layer(const std::vector& shape, const String reduction = "none") +using Layer_ScatterND = TestBaseWithParam, std::string, tuple>>; +PERF_TEST_P_(Layer_ScatterND, scatterND) { + std::vector shape = get<0>(GetParam()); + std::string reduction = get<1>(GetParam()); + int backend_id = get<0>(get<2>(GetParam())); + int target_id = get<1>(get<2>(GetParam())); + + std::vector indices_shape(shape); + indices_shape.push_back(int(shape.size())); + Mat data(shape, CV_32FC1); + Mat indices(indices_shape, CV_32FC1); + Mat updates(shape, CV_32FC1); + + randn(data, 0.f, 1.f); + randn(updates, 0.f, 1.f); + + // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W] + std::vector current_index_tuple(shape.size()); + int total = data.total(); + std::vector indices_step; + for (int i = 0; i < indices.dims; i++) { - int backendId = get<0>(GetParam()); - int targetId = get<1>(GetParam()); - - std::vector indices_shape(shape); - indices_shape.push_back(int(shape.size())); - Mat data(shape, CV_32FC1); - Mat indices(indices_shape, CV_32FC1); - Mat updates(shape, CV_32FC1); - - Scalar mean = 0.f; 
- Scalar std = 1.f; - randn(data, mean, std); - randn(updates, mean, std); - - // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W] - std::vector current_index_tuple(shape.size()); - int total = data.total(); - std::vector indices_step; - for (int i = 0; i < indices.dims; i++) + int step = indices.step.p[i] / sizeof(float); + indices_step.push_back(step); + } + int t, j, idx, offset_at_idx, offset; + for (int i = 0; i < total; i++) + { + t = i; + for (j = shape.size() - 1; j >= 0; j--) { - int step = indices.step.p[i] / sizeof(float); - indices_step.push_back(step); - } - int t, j, idx, offset_at_idx, offset; - for (int i = 0; i < total; i++) - { - t = i; - for (j = shape.size() - 1; j >= 0; j--) - { - idx = t / shape[j]; - offset_at_idx = (int)(t - idx * shape[j]); - current_index_tuple[j] = offset_at_idx; - t = idx; - } - - offset = 0; - for (j = 0; j < shape.size(); j++) - offset += current_index_tuple[j] * indices_step[j]; - - for (j = 0; j < shape.size(); j++) - indices.at(offset + j) = current_index_tuple[j]; + idx = t / shape[j]; + offset_at_idx = (int)(t - idx * shape[j]); + current_index_tuple[j] = offset_at_idx; + t = idx; } - Net net; - LayerParams lp; - lp.type = "ScatterND"; - lp.name = "testLayer"; - lp.set("reduction", reduction); + offset = 0; + for (j = 0; j < shape.size(); j++) + offset += current_index_tuple[j] * indices_step[j]; - int id = net.addLayerToPrev(lp.name, lp.type, lp); - net.connect(0, 0, id, 0); - net.connect(0, 1, id, 1); - net.connect(0, 2, id, 2); - - // warmup - { - std::vector inpNames(3); - inpNames[0] = "data"; - inpNames[1] = "indices"; - inpNames[2] = "updates"; - net.setInputsNames(inpNames); - net.setInput(data, inpNames[0]); - net.setInput(indices, inpNames[1]); - net.setInput(updates, inpNames[2]); - - net.setPreferableBackend(backendId); - net.setPreferableTarget(targetId); - Mat out = net.forward(); - } - - TEST_CYCLE() - { - Mat res = net.forward(); - } - - SANITY_CHECK_NOTHING(); + for (j = 
0; j < shape.size(); j++) + indices.at(offset + j) = current_index_tuple[j]; } - int N = 8; - int C = 256; - int H = 128; - int W = 100; -}; + Net net; + LayerParams lp; + lp.type = "ScatterND"; + lp.name = "testLayer"; + lp.set("reduction", reduction); -PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND) -{ - test_layer({N, C, H ,W}); + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup + { + std::vector input_names{"data", "indices", "updates"}; + net.setInputsNames(input_names); + net.setInput(data, input_names[0]); + net.setInput(indices, input_names[1]); + net.setInput(updates, input_names[2]); + + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); } -PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add) -{ - test_layer({N, C, H , W}, "add"); -} +INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine( + Values(std::vector{2, 128, 64, 50}), + Values(std::string("none"), std::string("add")), + dnnBackendsAndTargets(/* withInferenceEngine= */ false, + /* withHalide= */ false, + /* withCpuOCV= */ true, + /* withVkCom= */ false, + /* withCUDA= */ false, + /* withNgraph= */ false, + /* withWebnn= */ false, + /* withCann= */ false) // only test on CPU +)); struct Layer_LayerNorm : public TestBaseWithParam > { @@ -795,8 +787,6 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple #ifdef HAVE_CUDA INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA))); #endif -// INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); -INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, 
testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); diff --git a/modules/dnn/src/layers/scatterND_layer.cpp b/modules/dnn/src/layers/scatterND_layer.cpp index 648d35fc0c..531d32f45b 100644 --- a/modules/dnn/src/layers/scatterND_layer.cpp +++ b/modules/dnn/src/layers/scatterND_layer.cpp @@ -89,49 +89,59 @@ public: // NOTE: This impl does not check whether indices have duplicate entries. // The last duplicate entry will overwrite the previous. template - void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) - { - data.copyTo(out); + void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) { + input_mat.copyTo(output_mat); - const int* shape = data.size.p; - const size_t* step = data.step.p; + const auto &input_mat_shape = shape(input_mat); + std::vector input_mat_step(input_mat_shape.size()); + for (int i = 0; i < input_mat.dims; i++) { + input_mat_step[i] = static_cast(input_mat.step.p[i] / sizeof(T)); + } - const int ind_ndims = indices.dims; - const int* ind_shape = indices.size.p; - const T* p_indices = indices.ptr(); + const int indices_mat_ndims = indices_mat.dims; + const auto &indices_mat_shape = shape(indices_mat); - const int upd_ndims = updates.dims; - const int* upd_shape = updates.size.p; - const T* p_updates = updates.ptr(); + const int updates_mat_ndims = updates_mat.dims; + const auto &updates_mat_shape = shape(updates_mat); - T* p_out = out.ptr(); - - int k = ind_shape[ind_ndims - 1]; // last dim of indices - size_t total = (size_t)(indices.total() / k); + int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices size_t 
updates_size = 1; - for (int i = ind_ndims - 1; i < upd_ndims; i++) - updates_size *= upd_shape[i]; + for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++) + updates_size *= updates_mat_shape[i]; - size_t inp_start_offset = 0; - size_t ind_start_offset = 0; - size_t upd_start_offset = 0; - for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size) - { - const T* tmp_p_indices = p_indices + ind_start_offset; - inp_start_offset = 0; - for (int j = 0; j < k; j++) - { - CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]); - inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j]; + auto fn = [&](const Range &r) { + size_t input_offset = 0, + indices_offset = r.start * indices_last_dim, + updates_offset = r.start * updates_size; + for (int i = r.start; i < r.end; i++) { + const T* indices = indices_mat.ptr(); + const T* updates = updates_mat.ptr(); + T* output = output_mat.ptr(); + + input_offset = 0; + indices += indices_offset; + for (int j = 0; j < indices_last_dim; j++) { + int index = static_cast(*(indices + j)); + index = (index + input_mat_shape[j]) % input_mat_shape[j]; + CV_Assert(index < input_mat_shape[j] && index >= 0); + input_offset += index * input_mat_step[j]; + } + + updates += updates_offset; + output += input_offset; + for (int j = 0; j < updates_size; j++) { + output[j] = reduce_operation(output[j], updates[j]); + } + + indices_offset += indices_last_dim; + updates_offset += updates_size; } - inp_start_offset /= sizeof(T); + }; - const T* tmp_p_updates = p_updates + upd_start_offset; - T* tmp_p_out = p_out + inp_start_offset; - for (int j = 0; j < updates_size; j++) - tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]); - } + size_t total = (size_t)(indices_mat.total() / indices_last_dim); + double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); } template From 
b7d70613e431a5f8875001bc590d72d710429954 Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Fri, 5 Jan 2024 18:33:01 +0000 Subject: [PATCH 20/57] fix failed assertion in debug build --- modules/dnn/perf/perf_layer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 3e477da125..ad3bd4c223 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -350,6 +350,7 @@ PERF_TEST_P_(Layer_ScatterND, scatterND) { indices_step.push_back(step); } int t, j, idx, offset_at_idx, offset; + auto *indices_ptr = indices.ptr(); for (int i = 0; i < total; i++) { t = i; @@ -366,7 +367,7 @@ PERF_TEST_P_(Layer_ScatterND, scatterND) { offset += current_index_tuple[j] * indices_step[j]; for (j = 0; j < shape.size(); j++) - indices.at(offset + j) = current_index_tuple[j]; + indices_ptr[offset + j] = current_index_tuple[j]; } Net net; From 34a871c8550ab3e15976aee5baf47913a6c6bae4 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 6 Jan 2024 01:41:02 -0500 Subject: [PATCH 21/57] Fix building on OpenBSD X86 --- modules/core/src/system.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 9f67d92a43..4cba6eb2d2 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -251,7 +251,7 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix) #include "omp.h" #endif -#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__ +#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __GLIBC__ || defined __HAIKU__ #include #include #include From 3b287770b984cce7c87211c04b86d0c2cfd2b1bf Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 6 Jan 2024 19:35:25 -0500 Subject: [PATCH 22/57] Corrections for FreeBSD ARM support FreeBSD does not have 
the /proc file system. FreeBSD was added to the code path for aarch64 before the use of the /proc file system with f7b4b750d8930b5bb6696cea6d609dc70a0597db but then /proc usage was added not long after with b3269b08a19d1da49cf63754d92bdbd39e22c568 --- modules/core/src/system.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 4cba6eb2d2..c61fd67a19 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -562,7 +562,7 @@ struct HWFeatures } #endif // CV_CPUID_X86 - #if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __QNX__ + #if defined __ANDROID__ || defined __linux__ || defined __QNX__ #ifdef __aarch64__ have[CV_CPU_NEON] = true; have[CV_CPU_FP16] = true; @@ -611,7 +611,7 @@ struct HWFeatures CV_LOG_INFO(NULL, "- FP16 instructions is NOT enabled via build flags"); #endif #endif - #elif defined __arm__ && !defined __FreeBSD__ + #elif defined __arm__ int cpufile = open("/proc/self/auxv", O_RDONLY); if (cpufile >= 0) From a86e9f161d2ff140de4f636370b8fc5d184d9609 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sun, 7 Jan 2024 01:26:24 -0500 Subject: [PATCH 23/57] Fix building on OpenBSD FAILED: lib/libopencv_core.so.13.0 ... ld: error: undefined symbol: __cxa_atexit Do not try to use --no-undefined on OpenBSD. OpenBSD does not link shared libraries with libc and thus linkage with --no-undefined is expected to fail. 
--- cmake/OpenCVCompilerOptions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 427189c079..af38484f5b 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -391,7 +391,7 @@ endif() # Apply "-Wl,--no-undefined" linker flags: https://github.com/opencv/opencv/pull/21347 if(NOT OPENCV_SKIP_LINK_NO_UNDEFINED) - if(UNIX AND (NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2")) + if(UNIX AND ((NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2") AND NOT CMAKE_SYSTEM_NAME MATCHES "OpenBSD")) set(_option "-Wl,--no-undefined") set(_saved_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${_option}") # requires CMake 3.2+ and CMP0056 From 48fd23a02ae1d63ff7f83e4123340fbe27dd3949 Mon Sep 17 00:00:00 2001 From: shenleban tongying Date: Sun, 7 Jan 2024 06:32:29 -0500 Subject: [PATCH 24/57] fix highgui qt's statusbar text got cropped --- modules/highgui/src/window_QT.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp index e764b8544c..c72163cd6b 100644 --- a/modules/highgui/src/window_QT.cpp +++ b/modules/highgui/src/window_QT.cpp @@ -1725,14 +1725,14 @@ CvWindow::CvWindow(QString name, int arg2) //Now attach everything if (myToolBar) - myGlobalLayout->addWidget(myToolBar, Qt::AlignCenter); + myGlobalLayout->addWidget(myToolBar, 0, Qt::AlignLeft); - myGlobalLayout->addWidget(myView->getWidget(), Qt::AlignCenter); + myGlobalLayout->addWidget(myView->getWidget(), 0, Qt::AlignCenter); - myGlobalLayout->addLayout(myBarLayout, Qt::AlignCenter); + myGlobalLayout->addLayout(myBarLayout); if (myStatusBar) - myGlobalLayout->addWidget(myStatusBar, Qt::AlignCenter); + myGlobalLayout->addWidget(myStatusBar, 0, Qt::AlignLeft); setLayout(myGlobalLayout); show(); @@ -2142,7 +2142,6 @@ void 
CvWindow::createStatusBar() { myStatusBar = new QStatusBar(this); myStatusBar->setSizeGripEnabled(false); - myStatusBar->setFixedHeight(20); myStatusBar->setMinimumWidth(1); myStatusBar_msg = new QLabel; From 8ade7f6177285b684346258bf98d128efa107519 Mon Sep 17 00:00:00 2001 From: Alexander Lyulkov Date: Sun, 7 Jan 2024 20:55:57 +0300 Subject: [PATCH 25/57] Added any screen orientation support for JavaCameraView --- .../org/opencv/android/JavaCameraView.java | 126 ++++++++++++++---- .../face-detection/gradle/AndroidManifest.xml | 1 - .../facedetect/FaceDetectActivity.java | 5 +- .../gradle/AndroidManifest.xml | 1 - 4 files changed, 101 insertions(+), 32 deletions(-) diff --git a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java index a7c72e43f0..1c10c3cb12 100644 --- a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java +++ b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java @@ -10,9 +10,12 @@ import android.hardware.Camera.PreviewCallback; import android.os.Build; import android.util.AttributeSet; import android.util.Log; +import android.view.Surface; import android.view.ViewGroup.LayoutParams; +import android.view.WindowManager; import org.opencv.BuildConfig; +import org.opencv.core.Core; import org.opencv.core.CvType; import org.opencv.core.Mat; import org.opencv.core.Size; @@ -71,28 +74,20 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb boolean result = true; synchronized (this) { mCamera = null; + int cameraId = -1; if (mCameraIndex == CAMERA_ID_ANY) { - Log.d(TAG, "Trying to open camera with old open()"); - try { - mCamera = Camera.open(); - } - catch (Exception e){ - Log.e(TAG, "Camera is not available (in use or does not exist): " + e.getLocalizedMessage()); - } - - if(mCamera == null && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) { - boolean connected = false; 
- for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { - Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")"); - try { - mCamera = Camera.open(camIdx); - connected = true; - } catch (RuntimeException e) { - Log.e(TAG, "Camera #" + camIdx + "failed to open: " + e.getLocalizedMessage()); - } - if (connected) break; + boolean connected = false; + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")"); + try { + mCamera = Camera.open(camIdx); + connected = true; + cameraId = camIdx; + } catch (RuntimeException e) { + Log.e(TAG, "Camera #" + camIdx + "failed to open: " + e.getLocalizedMessage()); } + if (connected) break; } } else { if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) { @@ -126,6 +121,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")"); try { mCamera = Camera.open(localCameraIndex); + cameraId = localCameraIndex; } catch (RuntimeException e) { Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage()); } @@ -136,6 +132,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb if (mCamera == null) return false; + int frameRotation = getFrameRotation(cameraId); /* Now set camera parameters */ try { Camera.Parameters params = mCamera.getParameters(); @@ -176,8 +173,16 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mCamera.setParameters(params); params = mCamera.getParameters(); - mFrameWidth = params.getPreviewSize().width; - mFrameHeight = params.getPreviewSize().height; + int rawFrameWidth = params.getPreviewSize().width; + int rawFrameHeight = params.getPreviewSize().height; + + if (frameRotation % 180 == 0) { + mFrameWidth = params.getPreviewSize().width; + mFrameHeight = 
params.getPreviewSize().height; + } else { + mFrameWidth = params.getPreviewSize().height; + mFrameHeight = params.getPreviewSize().width; + } if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT)) mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth); @@ -196,14 +201,14 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mCamera.setPreviewCallbackWithBuffer(this); mFrameChain = new Mat[2]; - mFrameChain[0] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1); - mFrameChain[1] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1); + mFrameChain[0] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1); + mFrameChain[1] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1); AllocateCache(); mCameraFrame = new JavaCameraFrame[2]; - mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], mFrameWidth, mFrameHeight); - mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], mFrameWidth, mFrameHeight); + mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight, frameRotation); + mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight, frameRotation); if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) { mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID); @@ -313,7 +318,14 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb private class JavaCameraFrame implements CvCameraViewFrame { @Override public Mat gray() { - return mYuvFrameData.submat(0, mHeight, 0, mWidth); + mGray = mYuvFrameData.submat(0, mHeight, 0, mWidth); + + if (mRotation != 0) { + Core.rotate(mGray, mGrayRotated, getCvRotationCode(mRotation)); + return mGrayRotated; + } else { + return mGray; + } } @Override @@ -325,15 +337,33 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb else 
throw new IllegalArgumentException("Preview Format can be NV21 or YV12"); - return mRgba; + if (mRotation != 0) { + Core.rotate(mRgba, mRgbaRotated, getCvRotationCode(mRotation)); + return mRgbaRotated; + } else { + return mRgba; + } } - public JavaCameraFrame(Mat Yuv420sp, int width, int height) { + private int getCvRotationCode(int degrees) { + if (degrees == 90) { + return Core.ROTATE_90_CLOCKWISE; + } else if (degrees == 180) { + return Core.ROTATE_180; + } else { + return Core.ROTATE_90_COUNTERCLOCKWISE; + } + } + + public JavaCameraFrame(Mat Yuv420sp, int width, int height, int rotation) { super(); mWidth = width; mHeight = height; mYuvFrameData = Yuv420sp; mRgba = new Mat(); + mRgbaRotated = new Mat(); + mGrayRotated = new Mat(); + mRotation = rotation; } public void release() { @@ -342,10 +372,50 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb private Mat mYuvFrameData; private Mat mRgba; + private Mat mRgbaRotated; + private Mat mGray; + private Mat mGrayRotated; private int mWidth; private int mHeight; + private int mRotation; }; + /** + * Calculates how to rotate camera frame to match current screen orientation + */ + private int getFrameRotation(int cameraId) { + WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE); + int screenOrientation = windowManager.getDefaultDisplay().getRotation(); + int screenRotation = 0; + switch (screenOrientation) { + case Surface.ROTATION_0: + screenRotation = 0; + break; + case Surface.ROTATION_90: + screenRotation = 90; + break; + case Surface.ROTATION_180: + screenRotation = 180; + break; + case Surface.ROTATION_270: + screenRotation = 270; + break; + } + + android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo(); + android.hardware.Camera.getCameraInfo(cameraId, info); + + int frameRotation; + if (info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) { + frameRotation = (info.orientation + screenRotation) % 360; 
+ frameRotation = (360 - frameRotation) % 360; + } else { + frameRotation = (info.orientation - screenRotation + 360) % 360; + } + + return frameRotation; + } + private class CameraWorker implements Runnable { @Override diff --git a/samples/android/face-detection/gradle/AndroidManifest.xml b/samples/android/face-detection/gradle/AndroidManifest.xml index 5476bcfbfb..f018df2eec 100644 --- a/samples/android/face-detection/gradle/AndroidManifest.xml +++ b/samples/android/face-detection/gradle/AndroidManifest.xml @@ -11,7 +11,6 @@ android:exported="true" android:name="FaceDetectActivity" android:label="@string/app_name" - android:screenOrientation="landscape" android:configChanges="keyboardHidden|orientation"> diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java index f487b184ab..1ba50aec87 100644 --- a/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java +++ b/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java @@ -184,8 +184,9 @@ public class FaceDetectActivity extends CameraActivity implements CvCameraViewLi mRgba = inputFrame.rgba(); - if (mInputSize == null) { - mInputSize = new Size(Math.round(mRgba.cols()/mScale), Math.round(mRgba.rows()/mScale)); + Size inputSize = new Size(Math.round(mRgba.cols()/mScale), Math.round(mRgba.rows()/mScale)); + if (mInputSize == null || !mInputSize.equals(inputSize)) { + mInputSize = inputSize; mFaceDetector.setInputSize(mInputSize); } diff --git a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml index 98f2a2f35b..7f543d2ec7 100644 --- a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml +++ b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml @@ -12,7 +12,6 @@ android:exported="true" 
android:name="Tutorial1Activity" android:label="@string/app_name" - android:screenOrientation="landscape" android:configChanges="keyboardHidden|orientation"> From 13127365e257ad4ca7b05d2367b5a7d58b256ed3 Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Mon, 8 Jan 2024 11:55:06 +0800 Subject: [PATCH 26/57] better comment --- modules/dnn/perf/perf_layer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index ad3bd4c223..94a0a6b249 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -340,7 +340,7 @@ PERF_TEST_P_(Layer_ScatterND, scatterND) { randn(data, 0.f, 1.f); randn(updates, 0.f, 1.f); - // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W] + // Create indices such that indices[n_i, c_j, h_k, w_l, :4] = [i, j, k, l] std::vector current_index_tuple(shape.size()); int total = data.total(); std::vector indices_step; From bb402374986bffd37f9b427addb5aab1845a3ab6 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Mon, 8 Jan 2024 14:57:57 +0100 Subject: [PATCH 27/57] Make aruco detector deterministic. 
--- modules/objdetect/src/aruco/aruco_detector.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp index 031d4849b3..d5ddae8f1f 100644 --- a/modules/objdetect/src/aruco/aruco_detector.cpp +++ b/modules/objdetect/src/aruco/aruco_detector.cpp @@ -684,7 +684,7 @@ struct ArucoDetector::ArucoDetectorImpl { contours.clear(); // sort candidates from big to small - std::sort(candidateTree.begin(), candidateTree.end()); + std::stable_sort(candidateTree.begin(), candidateTree.end()); // group index for each candidate vector groupId(candidateTree.size(), -1); vector > groupedCandidates; @@ -728,11 +728,11 @@ struct ArucoDetector::ArucoDetectorImpl { for (vector& grouped : groupedCandidates) { if (detectorParams.detectInvertedMarker) // if detectInvertedMarker choose smallest contours - std::sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) { + std::stable_sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) { return a > b; }); else // if detectInvertedMarker==false choose largest contours - std::sort(grouped.begin(), grouped.end()); + std::stable_sort(grouped.begin(), grouped.end()); size_t currId = grouped[0]; isSelectedContours[currId] = true; for (size_t i = 1ull; i < grouped.size(); i++) { From c955564cb3f8b28915a3ebe7e5f04d9fb78926bb Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Tue, 9 Jan 2024 10:00:17 -0600 Subject: [PATCH 28/57] Merge pull request #24765 from fengyuentau:mod_operator dnn onnx: add mod #24765 Resolves https://github.com/opencv/opencv/issues/23174 TODO: - [x] enable some conformance tests - [x] add backends - [x] CANN - [x] OpenVINO - [x] CUDA ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. 
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake --- modules/dnn/src/cuda/eltwise_ops.cu | 14 +++++ modules/dnn/src/cuda/functors.hpp | 34 +++++++++++ modules/dnn/src/cuda/math.hpp | 7 +++ .../dnn/src/cuda4dnn/kernels/eltwise_ops.hpp | 6 ++ .../dnn/src/cuda4dnn/primitives/eltwise.hpp | 6 ++ .../dnn/src/layers/nary_eltwise_layers.cpp | 59 +++++++++++++++---- modules/dnn/src/onnx/onnx_importer.cpp | 7 ++- ...conformance_layer_filter__openvino.inl.hpp | 15 +++++ ..._conformance_layer_parser_denylist.inl.hpp | 3 - 9 files changed, 137 insertions(+), 14 deletions(-) diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 16f6cccf6b..8a861b3067 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -324,7 +324,19 @@ void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, eltwise_op>(stream, output, x, y); } +template +void eltwise_mod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y) { + eltwise_op>(stream, output, x, y); +} + +template +void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y) { + eltwise_op>(stream, output, x, y); +} + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) + template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); + template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); template void eltwise_sub_2(const Stream& stream, 
TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); @@ -333,6 +345,8 @@ void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); #endif + template void eltwise_mod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); template void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); template void eltwise_div_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); template void eltwise_prod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp index 2df32030f0..cada43387e 100644 --- a/modules/dnn/src/cuda/functors.hpp +++ b/modules/dnn/src/cuda/functors.hpp @@ -799,6 +799,40 @@ struct ReciprocalFunctor { } }; +template +struct ModFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() {} + }; + + CUDA4DNN_DEVICE ModFunctor() { } + CUDA4DNN_DEVICE ModFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { + int res = (int)x % (int)y; + T zero = T(0); + if ((res > (int)zero && y < zero) || (res < (int)zero && y > zero)) { + res += (int)y; + } + return res; + } +}; + +template +struct FModFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() {} + }; + + CUDA4DNN_DEVICE FModFunctor() { } + CUDA4DNN_DEVICE FModFunctor(const Params& params) { } + + 
CUDA4DNN_DEVICE T operator()(T x, T y) { + using csl::device::fmod; + return fmod(x, y); + } +}; + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */ diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index 0a312a250d..8e4f091f4f 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -36,6 +36,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de template <> inline __device__ float min(float x, float y) { return fminf(x, y); } template <> inline __device__ double min(double x, double y) { return fmin(x, y); } + template __device__ T fmod(T x, T y) { return x % y; } + template <> inline __device__ float fmod(float x, float y) { return fmodf(x, y); } + template <> inline __device__ double fmod(double x, double y) { return fmod(x, y); } +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) + template <> inline __device__ half fmod(half x, half y) { return fmodf((float)x, (float)y); } +#endif + template __device__ T log1p(T val); #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); } diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp index 3dc3355b3b..e80db943ae 100644 --- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp @@ -33,6 +33,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + template + void eltwise_mod_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + + template + void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* 
OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp index 05bca83820..5822f48061 100644 --- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp @@ -28,6 +28,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { DIV, MIN, SUB, + MOD, + FMOD, }; class EltwiseOpBase : public CUDABackendNode { @@ -90,6 +92,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { kernels::eltwise_sum_coeff_2(stream, output, coeffs[0], input_x, coeffs[1], input_y); break; case EltwiseOpType::SUB: kernels::eltwise_sub_2(stream, output, input_x, input_y); break; + case EltwiseOpType::MOD: kernels::eltwise_mod_2(stream, output, input_x, input_y); break; + case EltwiseOpType::FMOD: kernels::eltwise_fmod_2(stream, output, input_x, input_y); break; } } else @@ -122,6 +126,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { } break; case EltwiseOpType::SUB: kernels::eltwise_sub_2(stream, output, output, input); break; + case EltwiseOpType::MOD: kernels::eltwise_mod_2(stream, output, output, input); break; + case EltwiseOpType::FMOD: kernels::eltwise_fmod_2(stream, output, output, input); break; } } } diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp index c988ec69f2..661861cbe3 100644 --- a/modules/dnn/src/layers/nary_eltwise_layers.cpp +++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp @@ -24,6 +24,16 @@ namespace cv namespace dnn { +namespace { +static int _mod(int x, int y) { + int res = x % y; + if ((res < 0 && y > 0) || (res > 0 && y < 0)) { + res += y; + } + return res; +} +} + class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer { public: @@ -42,7 +52,8 @@ public: MAX, MEAN, MIN, - MOD, + MOD, // Integer Mod. Remainder's sign = Divisor's sign. + FMOD, // Floating-point Mod. Remainder's sign = Dividend's sign.
PROD, SUB, SUM, @@ -79,6 +90,8 @@ public: op = OPERATION::MIN; else if (operation == "mod") op = OPERATION::MOD; + else if (operation == "fmod") + op = OPERATION::FMOD; else if (operation == "mul") op = OPERATION::PROD; else if (operation == "sub") @@ -106,18 +119,21 @@ public: #ifdef HAVE_CANN if (backendId == DNN_BACKEND_CANN) return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB || - op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN; + op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN || + op == OPERATION::MOD || op == OPERATION::FMOD; #endif if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) return (op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::GREATER_EQUAL || - op == OPERATION::LESS_EQUAL + op == OPERATION::LESS_EQUAL || + op == OPERATION::MOD || + op == OPERATION::FMOD ); if (backendId == DNN_BACKEND_CUDA) { - return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM || - op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD || - op == OPERATION::SUB; + return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM || + op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD || + op == OPERATION::SUB || op == OPERATION::MOD || op == OPERATION::FMOD; } return backendId == DNN_BACKEND_OPENCV; } @@ -703,10 +719,16 @@ public: } case OPERATION::MOD: { - auto mod = [](const uint8_t &a, const uint8_t &b) { return a % b; }; + auto mod = [] (const T &a, const T &b) { return static_cast(_mod(int(a), int(b))); }; binary_forward(mod, std::forward(args)...); break; } + case OPERATION::FMOD: + { + auto fmod = [](const T &a, const T &b) { return std::fmod(a, b); }; + binary_forward(fmod, std::forward(args)...); + break; + } case OPERATION::PROD: { auto prod = [](const T &a, const T &b) { return a * b; }; @@ -778,9 +800,8 @@ public: opDispatch(std::forward(args)...); break; case CV_32F: - CV_Assert(op != OPERATION::BITSHIFT && op 
!= OPERATION::MOD && - op != OPERATION::AND && op != OPERATION::OR && - op != OPERATION::XOR); + CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::AND && + op != OPERATION::OR && op != OPERATION::XOR); opDispatch(std::forward(args)...); break; default: @@ -833,6 +854,12 @@ public: case OPERATION::SUB: op_ = cuda4dnn::EltwiseOpType::SUB; break; + case OPERATION::MOD: + op_ = cuda4dnn::EltwiseOpType::MOD; + break; + case OPERATION::FMOD: + op_ = cuda4dnn::EltwiseOpType::FMOD; + break; default: return Ptr(); // return empty cuda_node if the EltwiseOpType is unsupported type. }; @@ -877,6 +904,8 @@ public: BUILD_CANN_ELTWISE_OP(OPERATION::DIV, Xdivy, name); BUILD_CANN_ELTWISE_OP(OPERATION::MAX, Maximum, name); BUILD_CANN_ELTWISE_OP(OPERATION::MIN, Minimum, name); + BUILD_CANN_ELTWISE_OP(OPERATION::MOD, Mod, name); + BUILD_CANN_ELTWISE_OP(OPERATION::FMOD, Mod, name); #undef BUILD_CANN_ELTWISE_OP default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); } @@ -923,6 +952,16 @@ public: node = std::make_shared(inp0, inp1); else if (op == OPERATION::LESS_EQUAL) node = std::make_shared(inp0, inp1); + // Ideally we should do this but int32 internal blobs are converted to float32 data type in inference. + // TODO: Remove data type conversion when we have type inference.
+ else if (op == OPERATION::MOD) { + auto inp0_i64 = std::make_shared(inp0, ngraph::element::i64); + auto inp1_i64 = std::make_shared(inp1, ngraph::element::i64); + auto mod = std::make_shared(inp0_i64, inp1_i64); + node = std::make_shared(mod, ngraph::element::f32); + } + else if (op == OPERATION::FMOD) + node = std::make_shared(inp0, inp1); else CV_Error(Error::StsNotImplemented, "Operation is not implemented for nGraph backend"); return Ptr(new InfEngineNgraphNode(node)); diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 115738999a..f0b33d111b 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -2830,6 +2830,11 @@ void ONNXImporter::parseElementWise(LayerParams& layerParams, const opencv_onnx: layerParams.type = "NaryEltwise"; layerParams.set("operation", toLowerCase(node_proto.op_type())); + if (node_proto.op_type() == "Mod") { + if (layerParams.get("fmod", 0)) { + layerParams.set("operation", "fmod"); + }; + } // element-wise layers that can have >=1 inputs but actually have one input if (node_proto.input_size() == 1 && (op_type == "max" || op_type == "min" || op_type == "mean" || op_type == "sum")) @@ -4006,7 +4011,7 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version) dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] = dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = dispatch["GreaterOrEqual"] = - dispatch["LessOrEqual"] = &ONNXImporter::parseElementWise; + dispatch["LessOrEqual"] = dispatch["Mod"] = &ONNXImporter::parseElementWise; dispatch["Sum"] = dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseElementWise; dispatch["Where"] = &ONNXImporter::parseElementWise; diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp index 17d561d64b..199bfdcd18 100644 --- 
a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp @@ -1056,10 +1056,25 @@ CASE(test_mod_int64_fmod) // no filter CASE(test_mod_mixed_sign_float16) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001 + } CASE(test_mod_mixed_sign_float32) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001 + } CASE(test_mod_mixed_sign_float64) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104167 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156251 vs 0.0001 + } CASE(test_mod_mixed_sign_int16) // no filter CASE(test_mod_mixed_sign_int32) diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp index be60c38b86..68f49e5fa4 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp @@ -210,9 +210,6 @@ "test_min_uint8", "test_mod_broadcast", "test_mod_int64_fmod", -"test_mod_mixed_sign_float16", -"test_mod_mixed_sign_float32", -"test_mod_mixed_sign_float64", "test_mod_mixed_sign_int16", "test_mod_mixed_sign_int32", "test_mod_mixed_sign_int64", From 2beacc07e8975a91aac540ab3d053dca103aaf94 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 9 Jan 2024 18:08:15 +0300 Subject: [PATCH 29/57] Manage Python Limited API version externally. 
--- CMakeLists.txt | 1 + cmake/OpenCVDetectPython.cmake | 6 ++++++ modules/python/common.cmake | 1 + modules/python/python3/CMakeLists.txt | 9 --------- modules/python/src2/cv2.hpp | 5 ++++- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d0b9c7d194..0b87773865 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1866,6 +1866,7 @@ if(BUILD_opencv_python3) else() status(" Libraries:" HAVE_opencv_python3 THEN "${PYTHON3_LIBRARIES}" ELSE NO) endif() + status(" Limited API:" PYTHON3_LIMITED_API THEN "YES (ver ${PYTHON3_LIMITED_API_VERSION})" ELSE NO) status(" numpy:" PYTHON3_NUMPY_INCLUDE_DIRS THEN "${PYTHON3_NUMPY_INCLUDE_DIRS} (ver ${PYTHON3_NUMPY_VERSION})" ELSE "NO (Python3 wrappers can not be generated)") status(" install path:" HAVE_opencv_python3 THEN "${__INSTALL_PATH_PYTHON3}" ELSE "-") endif() diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake index a6aacb4de4..839ec1148d 100644 --- a/cmake/OpenCVDetectPython.cmake +++ b/cmake/OpenCVDetectPython.cmake @@ -291,6 +291,12 @@ find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYT PYTHON3_INCLUDE_DIR PYTHON3_INCLUDE_DIR2 PYTHON3_PACKAGES_PATH PYTHON3_NUMPY_INCLUDE_DIRS PYTHON3_NUMPY_VERSION) +# Problem in numpy >=1.15 <1.17 +OCV_OPTION(PYTHON3_LIMITED_API "Build with Python Limited API (not available with numpy >=1.15 <1.17)" NO + VISIBLE_IF PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" OR NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.17") +if(PYTHON3_LIMITED_API) + set(PYTHON3_LIMITED_API_VERSION "0x03060000" CACHE STRING "Minimal Python version for Limited API") +endif() if(PYTHON_DEFAULT_EXECUTABLE) set(PYTHON_DEFAULT_AVAILABLE "TRUE") diff --git a/modules/python/common.cmake b/modules/python/common.cmake index a233fe0232..cd6c27984a 100644 --- a/modules/python/common.cmake +++ b/modules/python/common.cmake @@ -46,6 +46,7 @@ if(${PYTHON}_LIMITED_API) # support only python3.3+ ocv_assert(${PYTHON}_VERSION_MAJOR 
EQUAL 3 AND ${PYTHON}_VERSION_MINOR GREATER 2) target_compile_definitions(${the_module} PRIVATE CVPY_DYNAMIC_INIT) + target_compile_definitions(${the_module} PRIVATE PYTHON3_LIMITED_API_VERSION=${PYTHON3_LIMITED_API_VERSION}) if(WIN32) string(REPLACE "python${${PYTHON}_VERSION_MAJOR}${${PYTHON}_VERSION_MINOR}.lib" diff --git a/modules/python/python3/CMakeLists.txt b/modules/python/python3/CMakeLists.txt index d95af21e04..da86ba5c5e 100644 --- a/modules/python/python3/CMakeLists.txt +++ b/modules/python/python3/CMakeLists.txt @@ -2,15 +2,6 @@ if(NOT PYTHON3_INCLUDE_PATH OR NOT PYTHON3_NUMPY_INCLUDE_DIRS) ocv_module_disable(python3) endif() -# Problem in numpy >=1.15 <1.17 -if(PYTHON3_LIMITED_API - AND NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" - AND PYTHON3_NUMPY_VERSION VERSION_LESS "1.17" - ) - message(WARNING "Current NUMPY version (${PYTHON3_NUMPY_VERSION}) is not compatible with LIMITED_API.") - set(PYTHON3_LIMITED_API OFF) -endif() - set(the_description "The python3 bindings") set(MODULE_NAME python3) set(MODULE_INSTALL_SUBDIR python3) diff --git a/modules/python/src2/cv2.hpp b/modules/python/src2/cv2.hpp index b7992582ad..2697c781ca 100644 --- a/modules/python/src2/cv2.hpp +++ b/modules/python/src2/cv2.hpp @@ -13,7 +13,10 @@ // #define Py_DEBUG #if defined(CVPY_DYNAMIC_INIT) && !defined(Py_DEBUG) -# define Py_LIMITED_API 0x03030000 +# ifndef PYTHON3_LIMITED_API_VERSION +# define PYTHON3_LIMITED_API_VERSION 0x03060000 +# endif +# define Py_LIMITED_API PYTHON3_LIMITED_API_VERSION #endif #include From 7fb336322db5926ef5fba9ec9a18be3da9905813 Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Wed, 10 Jan 2024 04:01:00 -0600 Subject: [PATCH 30/57] Merge pull request #24808 from fengyuentau:fix_layernorm dnn: no layer norm fusion if axes.back() is not the axis of last dimension #24808 Merge with https://github.com/opencv/opencv_extra/pull/1137 Resolves https://github.com/opencv/opencv/issues/24797 ### Pull Request Readiness Checklist See details at 
https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- .../dnn/src/onnx/onnx_graph_simplifier.cpp | 56 ++++++++++++++++--- modules/dnn/test/test_graph_simplifier.cpp | 4 ++ modules/dnn/test/test_onnx_importer.cpp | 4 ++ 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index 77dc1c52df..7b8dd483c7 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -86,6 +86,7 @@ public: int getTensorShapeSize(int node_id, int node_input_id) { const auto node = getNode(node_id); const auto &input_name = node->getInputName(node_input_id); + // try to get from value_info for (int i = 0; i < net.value_info_size(); i++) { const auto value_info = net.value_info(i); if (value_info.name() == input_name) { @@ -97,6 +98,18 @@ public: } } } + // try to get from input + for (int i = 0; i < net.input_size(); i++) { + const auto input = net.input(i); + if (input.name() == input_name) { + if (input.has_type() && input.type().has_tensor_type() && + input.type().tensor_type().has_shape()) { + return input.type().tensor_type().shape().dim_size(); + } else { + return -1; + } + } + } return -1; } @@ -660,6 +673,10 @@ private: [Input] -> LayerNorm -> [Output] \ [weight], [bias] + + Note: axes of ReduceMean must be: + - last element is the axis of 
last dimension (-1 or (input_ndims - 1)) + - a list of adjacent axes, e.g. [1, 2, 3, ..., input_ndims - 1] */ class LayerNormSubGraph : public Subgraph { @@ -683,19 +700,22 @@ public: setFusedNode("LayerNormalization", input); } - static float extractAxis(const Ptr& net, int node_id) + static std::vector extractAxis(const Ptr& net, int node_id) { + // TODO: consider ReduceMean-18 which has axes as one of the inputs instead of attributes Ptr mean_ptr = net->getNode(node_id); opencv_onnx::NodeProto* mean_node = mean_ptr.dynamicCast()->node; - int axis_ = -1; + std::vector axes; for (int i = 0; i < mean_node->attribute_size(); i++) { opencv_onnx::AttributeProto attr = mean_node->attribute(i); if (attr.name() != "axes") continue; - axis_ = static_cast(attr.ints(0)); + for (int j = 0; j < attr.ints_size(); j++) { + axes.push_back(attr.ints(j)); + } } - return axis_; + return axes; } virtual bool match(const Ptr& net, int nodeId, @@ -707,11 +727,31 @@ public: if (pow_exp - 2 > 1e-5) // not pow(2) return false; - int axis_mean1 = extractAxis(net, matchedNodesIds[mean]); - int axis_mean2 = extractAxis(net, matchedNodesIds[mean1]); - if (axis_mean1 != axis_mean2) + std::vector axes = extractAxis(net, matchedNodesIds[mean]); + // check whether it is -1 or last_axis or [axis, ..., last_axis] + int64_t input_ndims = static_cast(net.dynamicCast()->getTensorShapeSize(matchedNodesIds[mean], 0)); + if (input_ndims == -1) { + return false; // input shape unknown + } + // assume that axes are sorted in ascending order, e.g. 
[0, 1, 2, 3] or [-3, -2, -1] + if (axes.back() != -1 && axes.back() != (input_ndims - 1)) { return false; - axis = axis_mean1; + } + for (size_t i = 0; i < axes.size() - 1; i++) { + if (axes[i] - axes[i + 1] != -1) { + return false; + } + } + + std::vector axes1 = extractAxis(net, matchedNodesIds[mean1]); + if (axes.size() != axes1.size()) + return false; + for (size_t i = 0; i < axes.size(); i++) { + if (((axes[i] + input_ndims) % input_ndims) != ((axes1[i] + input_ndims) % input_ndims)) { + return false; + } + } + axis = axes[0]; epsilon = extractConstant(net, matchedNodesIds[add], 1).at(0); diff --git a/modules/dnn/test/test_graph_simplifier.cpp b/modules/dnn/test/test_graph_simplifier.cpp index e09a68c158..91b4e271f5 100644 --- a/modules/dnn/test/test_graph_simplifier.cpp +++ b/modules/dnn/test/test_graph_simplifier.cpp @@ -47,6 +47,10 @@ TEST_F(Test_Graph_Simplifier, LayerNormSubGraph) { test("layer_norm_expanded_with_initializers", "LayerNormalization"); } +TEST_F(Test_Graph_Simplifier, LayerNormNoFusionSubGraph) { + test("layer_norm_no_fusion", std::vector{"NaryEltwise", "Reduce", "Sqrt"}); +} + TEST_F(Test_Graph_Simplifier, ResizeSubgraph) { /* Test for 6 subgraphs: - GatherCastSubgraph diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 457b151ccf..4d56cb0e17 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -3024,6 +3024,10 @@ TEST_P(Test_ONNX_nets, VitTrack) { normAssert(ref_output3, outputs[2], "VitTrack output3"); } +TEST_P(Test_ONNX_layers, LayerNormNoFusion) { + testONNXModels("layer_norm_no_fusion"); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets()); }} // namespace From fde99e68c02d10401f1d7c85c34919760cf910f7 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 10 Jan 2024 16:22:03 +0300 Subject: [PATCH 31/57] Removed Android AIDL from build scrips and tutorials as it's not needed since 4.9.0. 
--- cmake/android/android_gradle_projects.cmake | 4 ---- .../android_binary_package/dev_with_OCV_on_Android.markdown | 3 +-- modules/java/CMakeLists.txt | 4 +--- modules/java/android_sdk/android_gradle_lib/build.gradle | 1 - modules/java/android_sdk/build.gradle.in | 4 ---- modules/java/generator/gen_java.py | 4 ++-- platforms/android/aar-template/OpenCV/build.gradle.template | 4 +--- samples/android/15-puzzle/build.gradle.in | 1 - samples/android/camera-calibration/build.gradle.in | 1 - samples/android/color-blob-detection/build.gradle.in | 1 - samples/android/face-detection/build.gradle.in | 1 - samples/android/image-manipulations/build.gradle.in | 1 - samples/android/mobilenet-objdetect/build.gradle.in | 1 - samples/android/qr-detection/build.gradle.in | 1 - samples/android/tutorial-1-camerapreview/build.gradle.in | 1 - samples/android/tutorial-2-mixedprocessing/build.gradle.in | 1 - samples/android/tutorial-3-cameracontrol/build.gradle.in | 1 - samples/android/tutorial-4-opencl/build.gradle.in | 1 - samples/android/video-recorder/build.gradle.in | 1 - 19 files changed, 5 insertions(+), 31 deletions(-) diff --git a/cmake/android/android_gradle_projects.cmake b/cmake/android/android_gradle_projects.cmake index 9b3e7fbc9f..2b79806fa8 100644 --- a/cmake/android/android_gradle_projects.cmake +++ b/cmake/android/android_gradle_projects.cmake @@ -89,15 +89,11 @@ else() ocv_update(OPENCV_ANDROID_NAMESPACE_DECLARATION "") endif() -# set android gradle java version in build.gradle and set aidl config if(NOT (ANDROID_GRADLE_PLUGIN_VERSION VERSION_LESS "8.0.0")) # AGP-8.0 requires a minimum JDK version of JDK17 ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "17") - # Enable aidl configuration for OpenCV compile with AGP-8.0 - ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "buildFeatures { aidl true }") else() ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "1_8") - ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "") endif() set(ANDROID_GRADLE_JAVA_VERSION 
"${ANDROID_GRADLE_JAVA_VERSION_INIT}" CACHE STRING "Android Gradle Java version") diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown index 68d7ab3644..455745db6a 100644 --- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown +++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown @@ -75,11 +75,10 @@ In addition to this instruction you can use some video guide, for example [this @endcode The fix was found [here](https://stackoverflow.com/questions/73225714/import-opencv-sdk-to-android-studio-chipmunk) -6. OpenCV project uses `aidl` and `buildConfig` features. Please enable them in +6. OpenCV project uses `buildConfig` feature. Please enable it in `MyApplication/OpenCV/build.gradle` file to `android` block: @code{.gradle} buildFeatures{ - aidl true buildConfig true } diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt index 7fe90a0cb3..7207997e1b 100644 --- a/modules/java/CMakeLists.txt +++ b/modules/java/CMakeLists.txt @@ -17,7 +17,7 @@ ocv_add_module(java BINDINGS opencv_core opencv_imgproc PRIVATE_REQUIRED opencv_ include(${CMAKE_CURRENT_SOURCE_DIR}/common.cmake) -# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA, AIDL) +# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA) macro(glob_more_specific_sources _type _root _output) unset(_masks) if(${_type} STREQUAL "H") @@ -26,8 +26,6 @@ macro(glob_more_specific_sources _type _root _output) set(_masks "${_root}/cpp/*.cpp") elseif(${_type} STREQUAL "JAVA") set(_masks "${_root}/java/*.java" "${_root}/java/*.java.in") - elseif(${_type} STREQUAL "AIDL") - set(_masks "${_root}/java/*.aidl") endif() if (_masks) file(GLOB _result ${_masks}) diff --git a/modules/java/android_sdk/android_gradle_lib/build.gradle 
b/modules/java/android_sdk/android_gradle_lib/build.gradle index b887cdb4b9..4394bd9a4e 100644 --- a/modules/java/android_sdk/android_gradle_lib/build.gradle +++ b/modules/java/android_sdk/android_gradle_lib/build.gradle @@ -42,7 +42,6 @@ android { main { jniLibs.srcDirs = ['../../jni'] java.srcDirs = ['src'] // TODO Use original files instead of copied into build directory - aidl.srcDirs = ['src'] res.srcDirs = ['@OpenCV_SOURCE_DIR@/modules/java/android_sdk/android_gradle_lib/res'] manifest.srcFile 'AndroidManifest.xml' } diff --git a/modules/java/android_sdk/build.gradle.in b/modules/java/android_sdk/build.gradle.in index 15bfdbefc1..d3e37d0362 100644 --- a/modules/java/android_sdk/build.gradle.in +++ b/modules/java/android_sdk/build.gradle.in @@ -121,8 +121,6 @@ android { targetCompatibility JavaVersion.VERSION_@ANDROID_GRADLE_JAVA_VERSION_INIT@ } - @ANDROID_GRADLE_BUILD_FEATURE_AIDL@ - buildTypes { debug { packagingOptions { @@ -139,7 +137,6 @@ android { } buildFeatures { - aidl true prefabPublishing true buildConfig true } @@ -153,7 +150,6 @@ android { main { jniLibs.srcDirs = ['native/libs'] java.srcDirs = ['java/src'] - aidl.srcDirs = ['java/src'] res.srcDirs = ['java/res'] manifest.srcFile 'java/AndroidManifest.xml' } diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index c893c0833a..88523b2c31 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -1240,13 +1240,13 @@ JNIEXPORT void JNICALL Java_org_opencv_%(module)s_%(j_cls)s_delete def copy_java_files(java_files_dir, java_base_path, default_package_path='org/opencv/'): global total_files, updated_files java_files = [] - re_filter = re.compile(r'^.+\.(java|aidl|kt)(.in)?$') + re_filter = re.compile(r'^.+\.(java|kt)(.in)?$') for root, dirnames, filenames in os.walk(java_files_dir): java_files += [os.path.join(root, filename) for filename in filenames if re_filter.match(filename)] java_files = [f.replace('\\', '/') for f in java_files] 
re_package = re.compile(r'^package +(.+);') - re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|aidl|kt)(.in)?$') + re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|kt)(.in)?$') for java_file in java_files: src = checkFileRemap(java_file) with open(src, 'r') as f: diff --git a/platforms/android/aar-template/OpenCV/build.gradle.template b/platforms/android/aar-template/OpenCV/build.gradle.template index 4f3a3846ec..23d88a6910 100644 --- a/platforms/android/aar-template/OpenCV/build.gradle.template +++ b/platforms/android/aar-template/OpenCV/build.gradle.template @@ -39,7 +39,6 @@ android { } } buildFeatures { - aidl true prefabPublishing true buildConfig true } @@ -52,7 +51,6 @@ android { main { java.srcDirs = ['src/main/java'] //jniLibs.srcDirs = ['libs'] - aidl.srcDirs = ['src/main/java'] } } @@ -85,4 +83,4 @@ publishing { } dependencies { -} \ No newline at end of file +} diff --git a/samples/android/15-puzzle/build.gradle.in b/samples/android/15-puzzle/build.gradle.in index b2a8975dce..e7f6b4af56 100644 --- a/samples/android/15-puzzle/build.gradle.in +++ b/samples/android/15-puzzle/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/camera-calibration/build.gradle.in b/samples/android/camera-calibration/build.gradle.in index 3cd3d9200e..8c97fb22ab 100644 --- a/samples/android/camera-calibration/build.gradle.in +++ b/samples/android/camera-calibration/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/color-blob-detection/build.gradle.in b/samples/android/color-blob-detection/build.gradle.in index b0c50859f9..bd29338970 100644 --- 
a/samples/android/color-blob-detection/build.gradle.in +++ b/samples/android/color-blob-detection/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/face-detection/build.gradle.in b/samples/android/face-detection/build.gradle.in index 6fc4ce26c7..72dabd664d 100644 --- a/samples/android/face-detection/build.gradle.in +++ b/samples/android/face-detection/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/image-manipulations/build.gradle.in b/samples/android/image-manipulations/build.gradle.in index b12701f377..3c5034ea9b 100644 --- a/samples/android/image-manipulations/build.gradle.in +++ b/samples/android/image-manipulations/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/mobilenet-objdetect/build.gradle.in b/samples/android/mobilenet-objdetect/build.gradle.in index 377d151d1b..9e8e49b668 100644 --- a/samples/android/mobilenet-objdetect/build.gradle.in +++ b/samples/android/mobilenet-objdetect/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/qr-detection/build.gradle.in b/samples/android/qr-detection/build.gradle.in index 469dea609c..0951b70cdb 100644 --- 
a/samples/android/qr-detection/build.gradle.in +++ b/samples/android/qr-detection/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/tutorial-1-camerapreview/build.gradle.in b/samples/android/tutorial-1-camerapreview/build.gradle.in index deb9cf1b03..7b308b2abb 100644 --- a/samples/android/tutorial-1-camerapreview/build.gradle.in +++ b/samples/android/tutorial-1-camerapreview/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/tutorial-2-mixedprocessing/build.gradle.in b/samples/android/tutorial-2-mixedprocessing/build.gradle.in index e0f0a6b3c7..a156f42240 100644 --- a/samples/android/tutorial-2-mixedprocessing/build.gradle.in +++ b/samples/android/tutorial-2-mixedprocessing/build.gradle.in @@ -33,7 +33,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/tutorial-3-cameracontrol/build.gradle.in b/samples/android/tutorial-3-cameracontrol/build.gradle.in index d83f37d74e..d9c7f29ac3 100644 --- a/samples/android/tutorial-3-cameracontrol/build.gradle.in +++ b/samples/android/tutorial-3-cameracontrol/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/tutorial-4-opencl/build.gradle.in b/samples/android/tutorial-4-opencl/build.gradle.in 
index d9c8cda60e..8eeb12b17d 100644 --- a/samples/android/tutorial-4-opencl/build.gradle.in +++ b/samples/android/tutorial-4-opencl/build.gradle.in @@ -35,7 +35,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } diff --git a/samples/android/video-recorder/build.gradle.in b/samples/android/video-recorder/build.gradle.in index 506d98ce44..d096f3190a 100644 --- a/samples/android/video-recorder/build.gradle.in +++ b/samples/android/video-recorder/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } From 72aa3f2bc5f8e47b3514a8cba4f7ba365d004dba Mon Sep 17 00:00:00 2001 From: Phil Nelson Date: Wed, 10 Jan 2024 10:18:52 -0800 Subject: [PATCH 32/57] Update README.md - remove IndieGoGo, add Support OpenCV page --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index bf4010ddf2..e812c37393 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,5 @@ ## OpenCV: Open Source Computer Vision Library -### Keep OpenCV Free - -OpenCV is raising funds to keep the library free for everyone, and we need the support of the entire community to do it. [Donate to OpenCV on IndieGoGo](http://igg.me/at/opencv5) before the campaign ends on December 16 to show your support. 
### Resources @@ -13,6 +10,7 @@ OpenCV is raising funds to keep the library free for everyone, and we need the s * previous forum (read only): * Issue tracking: * Additional OpenCV functionality: +* Donate to OpenCV: ### Contributing From 83acb656f1191792ed31aad62a7eb2c4bcb4661b Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Wed, 10 Jan 2024 16:44:41 +0800 Subject: [PATCH 33/57] integrate bias handling in ocl kernel --- .../dnn/src/layers/fully_connected_layer.cpp | 7 ------- .../src/ocl4dnn/src/ocl4dnn_inner_product.cpp | 20 +++++++++++-------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 1c27043f1a..809630188a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -455,13 +455,6 @@ public: ret = false; break; } - - if (!use_half && bias && (outerSize > 1)) - { - UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); - UMat& biases = umat_blobs[1]; - cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); - } } if (ret) return true; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp index ee7a2c7b01..d45ff8c634 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp @@ -97,15 +97,19 @@ bool OCL4DNNInnerProduct::Forward(const UMat& bottom, max_image_size); } - if (use_half_ && bias_term_) - { - UMat biasOneMat = UMat::ones(M_, 1, CV_32F); - UMat newbias, tmpTop; + if (bias_term_) { + if (use_half_) { + UMat biasOneMat = UMat::ones(M_, 1, CV_32F); + UMat newbias, tmpTop; - convertFp16(bias, newbias); - convertFp16(top, tmpTop); - cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0); - convertFp16(tmpTop, top); + convertFp16(bias, newbias); + convertFp16(top, tmpTop); + cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0); 
+ convertFp16(tmpTop, top); + } else { + UMat biasOnesMat = UMat::ones(M_, 1, CV_32F); + cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0); + } } return ret; From e7ccff9805d240b7e86c419b57428f37ae61d199 Mon Sep 17 00:00:00 2001 From: Yuantao Feng Date: Thu, 11 Jan 2024 01:04:46 -0600 Subject: [PATCH 34/57] Merge pull request #24834 from fengyuentau:cuda_naryeltwise_broadcast dnn (cuda): support broadcasting if a.rank() != b.rank() #24834 Inspired by https://github.com/opencv/opencv/pull/24786. This PR keeps the fusion of `NaryEltwise` and `Concat` while addressed the data missing problem via supporting broadcasting if a.rank() != b.rank(). Resolves https://github.com/opencv/opencv/issues/23977 Resolves https://github.com/opencv/opencv/issues/24606 Resolves https://github.com/opencv/opencv/issues/24635 Resolves https://github.com/opencv/opencv/issues/24721 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- modules/dnn/src/cuda/eltwise_ops.cu | 44 ++++++++---- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 17 +++++ .../dnn/src/layers/nary_eltwise_layers.cpp | 13 ---- modules/dnn/src/net_impl_fuse.cpp | 4 ++ modules/dnn/test/test_backends.cpp | 71 +++++++++++++++++++ modules/dnn/test/test_layers.cpp | 2 +- 6 files changed, 123 insertions(+), 28 deletions(-) diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 8a861b3067..e2a7cc9a67 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan output, TensorView x, Ten } else { - CV_Assert(is_shape_compatible(output, x)); - CV_Assert(is_shape_compatible(output, y)); + auto inShape1 = x.shape_as_vector(); + auto inShape2 = y.shape_as_vector(); + auto outShape = output.shape_as_vector(); + + std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size(); + if (x_ndims >= y_ndims) { + for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) { + inShape2.insert(inShape2.begin(), 1); + } + } else { + for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) { + inShape1.insert(inShape1.begin(), 1); + } + } + + CV_Assert(is_shape_compatible1(outShape, inShape1)); + CV_Assert(is_shape_compatible1(outShape, inShape2)); /* matching singleton axes in both input tensors can be eliminated * @@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan output, TensorView x, Ten * x: [1, 256, 32, 32] -> [256, 32, 32] * y: [1, 256, 1, 1] -> [256, 1, 1] */ - for (int r = 0; r < output.rank(); r++) - { - while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) { - CV_Assert(output.get_axis_size(r) == 1); - - x.squeeze(r); - y.squeeze(r); - output.squeeze(r); + int eliminate_times = 0; + for (std::size_t i = 0; i < outShape.size(); i++) { + if (inShape1[i] == 1 && inShape2[i] 
== 1 && outShape[i] == 1 && i != (outShape.size() - 1)) { + eliminate_times++; + } else { + break; + } + } + if (eliminate_times > 0) { + for (int i = 0; i < eliminate_times; i++) { + inShape1.erase(inShape1.begin()); + inShape2.erase(inShape2.begin()); + outShape.erase(outShape.begin()); } } - - auto inShape1 = x.shape_as_vector(); - auto inShape2 = y.shape_as_vector(); - auto outShape = output.shape_as_vector(); /* contiguous axes that do not broadcast can be merged into one axis * diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 5a1286de99..8f495ac807 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return true; } + template + bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept { + const auto x_ndims = x_shape.size(), y_ndims = y_shape.size(); + + if (x_ndims != y_ndims) { + return false; + } + + for (int i = 0; i < x_ndims; i++) { + if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) { + return false; + } + } + + return true; + } + /** returns the rank to which the given tensor can be squeezed to */ template std::size_t get_effective_rank(const TensorType& x) noexcept { diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp index 661861cbe3..b22eb5bbf0 100644 --- a/modules/dnn/src/layers/nary_eltwise_layers.cpp +++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp @@ -818,19 +818,6 @@ public: { auto context = reinterpret_cast(context_); - auto input_0_shape = inputs[0].dynamicCast()->getShape(); - for (int i = 1; i < inputs.size(); i++) - { - auto input_i_shape = inputs[i].dynamicCast()->getShape(); - if (input_0_shape.size() != input_i_shape.size()) - return Ptr(); - // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode - for 
(int j = 0; j < input_0_shape.size(); j++) - if (input_0_shape[j] != input_i_shape[j] && - input_0_shape[j] != 1 && input_i_shape[j] != 1) - return Ptr(); - } - cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM; switch (op) { case OPERATION::MAX: diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp index dfa542bd41..b81bf14acc 100644 --- a/modules/dnn/src/net_impl_fuse.cpp +++ b/modules/dnn/src/net_impl_fuse.cpp @@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector& blobsToKeep_) if(inp_i_data->skip || inp_i_data->consumers.size() != 1) break; #ifdef HAVE_CUDA + /* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance + that Concat's output is filled with data in both host and device, leading to data missing. + See https://github.com/opencv/opencv/issues/24721 for more details. + */ if (preferableBackend == DNN_BACKEND_CUDA && (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false || (inp_i_data->layerInstance->type != "Convolution" && diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index f255ab87aa..591ec63515 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -102,6 +102,12 @@ public: Net net; }; +TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) { + processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0"); + expectNoFallbacksFromIE(net); + expectNoFallbacksFromCUDA(net); +} + TEST_P(DNNTestNetwork, AlexNet) { applyTestTag(CV_TEST_TAG_MEMORY_1GB); @@ -1518,6 +1524,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine( dnnBackendsAndTargets() )); +//////////////////////////////////////////////////////////////////////////////// +// Element-wise layers +//////////////////////////////////////////////////////////////////////////////// +using NaryEltwiseConcat = TestWithParam, tuple>>; +TEST_P(NaryEltwiseConcat, Accuracy) { + auto param = GetParam(); + 
std::vector input_shape = get<0>(param); + auto backend_id = get<0>(get<1>(param)); + auto target_id = get<1>(get<1>(param)); + + /* Build the following net: + + <1x4x84> + / + [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output] + | | + +-> Sigmoid ----------+ + + */ + Net net; + + std::vector mul_B_shape(input_shape.size() - 1, 1); + mul_B_shape.back() = input_shape.back(); + Mat mul_B(mul_B_shape, CV_32FC1); + randn(mul_B, 0.f, 1.f); + LayerParams mul_B_lp; + mul_B_lp.name = "mul_B"; + mul_B_lp.type = "Const"; + mul_B_lp.blobs.push_back(mul_B); + int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp); + + LayerParams mul_lp; + mul_lp.name = "mul"; + mul_lp.type = "NaryEltwise"; + mul_lp.set("operation", "mul"); + int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp); + net.connect(0, 0, id_mul, 0); + net.connect(id_mul_B, 0, id_mul, 1); + + LayerParams sigmoid_lp; + sigmoid_lp.name = "sigmoid"; + sigmoid_lp.type = "Sigmoid"; + int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp); + net.connect(0, 0, id_sigmoid, 0); + + LayerParams concat_lp; + concat_lp.name = "concat"; + concat_lp.type = "Concat"; + concat_lp.set("axis", 1); + int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp); + net.connect(id_mul, 0, id_concat, 0); + net.connect(id_sigmoid, 0, id_concat, 1); + + // Run test + Mat input(input_shape, CV_32FC1); + testLayer(input, net, backend_id, target_id, false); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine( + testing::Values(std::vector{1, 4, 84}), + dnnBackendsAndTargets()) +); + + + INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 707ae51673..744128544b 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -2050,7 +2050,7 @@ private: net.setPreferableTarget(target); Mat re; - 
ASSERT_NO_THROW(re = net.forward()); // runtime error + re = net.forward(); auto ptr_re = (float *) re.data; for (int i = 0; i < re.total(); i++) if (op == "sum"){ From 6eafcbb2e19b8a65d3f757360a15f54bf01f2d48 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 10 Jan 2024 15:38:24 +0300 Subject: [PATCH 35/57] Document Android development with Maven Central package. --- .../dev_with_OCV_on_Android.markdown | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown index 68d7ab3644..a7c06c9217 100644 --- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown +++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown @@ -18,7 +18,7 @@ This tutorial assumes you have the following installed and configured: - Android Studio - JDK - Android SDK and NDK -- OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases) +- Optional: OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases) or [SourceForge](https://sourceforge.net/projects/opencvlibrary/). Advanced: as alternative the SDK may be built from source code by [instruction on wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build). @@ -26,8 +26,9 @@ If you need help with anything of the above, you may refer to our @ref tutorial_ If you encounter any error after thoroughly following these steps, feel free to contact us via OpenCV [forum](https://forum.opencv.org). We'll do our best to help you out. -Hello OpenCV sample -------------------- + +Hello OpenCV sample with SDK +---------------------------- In this section we're gonna create a simple app that does nothing but OpenCV loading. 
In next section we'll extend it to support camera. @@ -115,6 +116,43 @@ In addition to this instruction you can use some video guide, for example [this ![](images/run_app.png) +Hello OpenCV sample with Maven Central +-------------------------------------- + +Since OpenCV 4.9.0 OpenCV for Android package is available with Maven Central and may be installed +automatically as Gradle dependency. In this section we're gonna create a simple app that does nothing +but OpenCV loading with Maven Central. + +1. Open Android Studio and create empty project by choosing ***Empty Views Activity*** + + ![](images/create_empty_project.png) + +2. Setup the project: + - Choose ***Java*** language + - Choose ***Groovy DSL*** build configuration language + - Choose ***Minimum SDK*** with the version number not less than OpenCV supports. For 4.9.0 minimal SDK version is 21. + + ![](images/setup_project.png) + +3. Edit `build.gradle` and add OpenCV library to Dependencies list like this: + @code{.gradle} + dependencies { + implementation 'org.opencv:opencv:4.9.0' + } + @endcode + `4.9.0` may be replaced by any version available as [official release](https://central.sonatype.com/artifact/org.opencv/opencv). + +4. Before using any OpenCV function you have to load the library first. If your application includes other + OpenCV-dependent native libraries you should load them ***after*** OpenCV initialization. Add the following + code to load the library at app start: + @snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java ocv_loader_init + Like this: + ![](images/sample_code.png) + +5.
Choose a device to check the sample on and run the code by pressing `run` button + + ![](images/run_app.png) + Camera view sample ------------------ From 1e190b3094fd6c5451f305e559f7ea4b3e7606a5 Mon Sep 17 00:00:00 2001 From: alexlyulkov Date: Fri, 12 Jan 2024 18:06:12 +0700 Subject: [PATCH 36/57] Merge pull request #24849 from alexlyulkov:al/aar-javadoc Modified AAR script: added javadoc to Android Maven #24849 Modified AAR script. Now the script creates 2 maven repos. The first repo contains sources jar, javadoc jar and AAR without cpp libraries. The second repo contains modified AAR with cpp libraries. The script merges two repos into one. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake Co-authored-by: Alexander Lyulkov Co-authored-by: Alexander Smorkalov --- .../aar-template/OpenCV/build.gradle.template | 37 +++++++++++++++++-- platforms/android/build_java_shared_aar.py | 13 +++++++ platforms/android/build_static_aar.py | 13 +++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/platforms/android/aar-template/OpenCV/build.gradle.template b/platforms/android/aar-template/OpenCV/build.gradle.template index 23d88a6910..10c8e64aa7 100644 --- a/platforms/android/aar-template/OpenCV/build.gradle.template +++ b/platforms/android/aar-template/OpenCV/build.gradle.template @@ -57,6 +57,7 @@ android { publishing { singleVariant('release') { withSourcesJar() + withJavadocJar() } } } @@ -64,14 +65,42 @@ android { publishing { publications { release(MavenPublication) { + // Builds aar, sources jar and javadoc jar from project sources and creates maven + groupId = 'org.opencv' + artifactId = '${PACKAGE_NAME}' + version = '${OPENCV_VERSION}' + afterEvaluate { + from components.release + } + } + modified(MavenPublication) { + // Creates maven from opencv-release.aar groupId = 'org.opencv' artifactId = '${PACKAGE_NAME}' version = '${OPENCV_VERSION}' artifact("opencv-release.aar") - -// afterEvaluate { -// from components.release -// } + pom { + name = "OpenCV" + description = "Open Source Computer Vision Library" + url = "https://opencv.org/" + licenses { + license { + name = "The Apache License, Version 2.0" + url = "https://github.com/opencv/opencv/blob/master/LICENSE" + } + } + developers { + developer { + id = "admin" + name = "OpenCV Team" + email = "admin@opencv.org" + } + } + scm { + connection = "scm:git:https://github.com/opencv/opencv.git" + url = "https://github.com/opencv/opencv" + } + } } } repositories { diff --git a/platforms/android/build_java_shared_aar.py b/platforms/android/build_java_shared_aar.py index e99c78ec28..ffb63c67e5 100755 --- 
a/platforms/android/build_java_shared_aar.py +++ b/platforms/android/build_java_shared_aar.py @@ -144,6 +144,8 @@ def main(args): print("Creating local maven repo...") shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar")) + + print("Creating a maven repo from project sources (with sources jar and javadoc jar)...") subprocess.run(["./gradlew", "publishReleasePublicationToMyrepoRepository"], shell=False, cwd=ANDROID_PROJECT_DIR, @@ -153,6 +155,17 @@ def main(args): shutil.move(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)) + print("Creating a maven repo from modified AAR (with cpp libraries)...") + subprocess.run(["./gradlew", "publishModifiedPublicationToMyrepoRepository"], + shell=False, + cwd=ANDROID_PROJECT_DIR, + check=True) + + # Replacing AAR from the first maven repo with modified AAR from the second maven repo + shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), + path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME), + dirs_exist_ok=True) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Builds AAR with Java and shared C++ libs from OpenCV SDK") diff --git a/platforms/android/build_static_aar.py b/platforms/android/build_static_aar.py index c1ab4046f4..20054047fa 100755 --- a/platforms/android/build_static_aar.py +++ b/platforms/android/build_static_aar.py @@ -216,6 +216,7 @@ def main(args): shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar")) + print("Creating a maven repo from project sources (with sources jar and javadoc jar)...") subprocess.run(["./gradlew", "publishReleasePublicationToMyrepoRepository"], shell=False, cwd=ANDROID_PROJECT_DIR, @@ -224,6 +225,18 @@ def main(args): os.makedirs(path.join(FINAL_REPO_PATH, "org/opencv"), exist_ok=True) shutil.move(path.join(ANDROID_PROJECT_DIR, 
"OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)) + + print("Creating a maven repo from modified AAR (with cpp libraries)...") + subprocess.run(["./gradlew", "publishModifiedPublicationToMyrepoRepository"], + shell=False, + cwd=ANDROID_PROJECT_DIR, + check=True) + + # Replacing AAR from the first maven repo with modified AAR from the second maven repo + shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), + path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME), + dirs_exist_ok=True) + print("Done") From c923c5983311211c2028e28028a26987b9912721 Mon Sep 17 00:00:00 2001 From: Abduragim Shtanchaev <44877829+Abdurrahheem@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:23:43 +0300 Subject: [PATCH 37/57] Merge pull request #24812 from Abdurrahheem:ash/einsum_bachedGemm Replace interactive batched Matrix Multiply. #24812 This PR replaces iterative batch matrix multiplication which `FastGemmBatch` in Einsum layer. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- .../dnn/src/layers/cpu_kernels/fast_gemm.cpp | 2 +- modules/dnn/src/layers/einsum_layer.cpp | 78 ++++--------------- 2 files changed, 14 insertions(+), 66 deletions(-) diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp index a8972aba4e..f8fe2bb40e 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp @@ -385,7 +385,7 @@ void fastGemmBatch(bool trans_a, bool trans_b, const auto shape_b = shape(B); const auto shape_c = shape(C); CV_CheckGE(shape_a.size(), static_cast(2), "DNN/fastGemmBatch: A must be n-dimensional (n >= 2)"); - CV_CheckEQ(shape_b.size(), static_cast(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)"); + CV_CheckGE(shape_b.size(), static_cast(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)"); const float *a = A.ptr(); const float *b = B.ptr(); diff --git a/modules/dnn/src/layers/einsum_layer.cpp b/modules/dnn/src/layers/einsum_layer.cpp index c7f9aaca06..d5153a5ab7 100644 --- a/modules/dnn/src/layers/einsum_layer.cpp +++ b/modules/dnn/src/layers/einsum_layer.cpp @@ -1299,7 +1299,6 @@ Mat LayerEinsumImpl::batchwiseMatMul( const Mat& input2, const MatShape& input2ShapeOverride) { - // Sanity checks before the actual MatMul CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul"); CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul"); @@ -1312,61 +1311,22 @@ Mat LayerEinsumImpl::batchwiseMatMul( int K = input1ShapeOverride[2]; int N = input2ShapeOverride[2]; - std::vector output; + Mat reshapedInput1 = input1; + Mat reshapedInput2 = input2; + + Mat output; if (batches > 1) { - Mat reshapedInput1 = input1; - Mat reshapedInput2 = input2; + // create tmpout with type like input1 + output = Mat({batches, M, N}, input1.type()); - // input1 should of 
size MxK - // check if input1 needs reshape, if need reshape - if (input1.size[0] != M || input1.size[1] != K) - { - int shape[] = {batches, M, K}; - reshapedInput1 = input1.reshape(1, 3, shape); - } - - // input2 should be of size KxN - // check if input2 needs reshape, if needs reshape - if (input2.size[0] != K || input2.size[1] != N) - { - int shape[] = {batches, K, N}; - reshapedInput2 = input2.reshape(1, 3, shape); - } - - for (size_t i=0; i < batches; i++) - { - std::vector ranges1 = {cv::Range(i, i+1)}; - for (int j = 1; j < reshapedInput1.dims; j++) - ranges1.emplace_back(cv::Range::all()); - - Mat part1 = reshapedInput1(ranges1); - int shape[] = {M, K}; - part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape); - - std::vector ranges2 = {cv::Range(i, i+1)}; - for (int j = 1; j < reshapedInput2.dims; j++) - ranges2.emplace_back(cv::Range::all()); - - Mat part2 = reshapedInput2(ranges2); - int shape2[] = {K, N}; - part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2); - - Mat tmp_output(M, N, part1.type()); - fastGemm(false, false, 1.0, part1, part2, 0.0, tmp_output, opt); - int newShape[] = {1, M, N}; - tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape); - - output.emplace_back(tmp_output); - } + reshapedInput2 = reshapedInput2.reshape(1, input2ShapeOverride); + reshapedInput1 = reshapedInput1.reshape(1, input1ShapeOverride); + fastGemmBatch(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt); } else { - Mat reshapedInput1 = input1; - Mat reshapedInput2 = input2; - // input1 should of size MxK - // check if input1 needs reshape, if need reshape if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K) { int shape[] = {M, K}; @@ -1374,30 +1334,18 @@ Mat LayerEinsumImpl::batchwiseMatMul( } // input2 should be of size KxN - // check if input2 needs reshape, if needs reshape if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N) { int shape2[] = {K, N}; reshapedInput2 = 
input2.reshape(1, 2, shape2); } - Mat tmp_output(M, N, reshapedInput1.type()); - fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, tmp_output, opt); - - int newShape[] = {1, M, N}; - tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape); - output.emplace_back(tmp_output); + output = Mat(M, N, reshapedInput1.type()); + fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt); + output = output.reshape(1, {1, M, N}); } - - int outputDim[] = {static_cast(output.size()), M, N}; - Mat output_buffer = Mat::zeros(3, outputDim, CV_32F); - - for (size_t i = 0; i < output.size(); i++) { - Mat output_slice = output_buffer.row(i); - output[i].copyTo(output_slice); - } - return output_buffer; + return output; }; Ptr EinsumLayer::create(const LayerParams& params) { From a7fa1e6f4b2813ffc3510f3aabfc81283b01e9d9 Mon Sep 17 00:00:00 2001 From: jimmylaw21 <95002380+jimmylaw21@users.noreply.github.com> Date: Fri, 12 Jan 2024 20:13:26 +0800 Subject: [PATCH 38/57] Merge pull request #24610 from jimmylaw21:dnn-onnx-add-group-norm-layer dnn onnx: add group norm layer #24610 dnn onnx: add group norm layer Todo: - [x] speed up by multi-threading - [x] add perf - [x] add backend: OpenVINO - [x] add backend: CUDA - [x] add backend: OpenCL (no fp16) - [ ] add backend: CANN ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake Co-authored-by: fengyuentau --- .../dnn/include/opencv2/dnn/all_layers.hpp | 5 + modules/dnn/perf/perf_layer.cpp | 62 +++++- modules/dnn/src/cuda/mvn.cu | 30 +++ modules/dnn/src/cuda4dnn/kernels/mvn.hpp | 4 + .../src/cuda4dnn/primitives/group_norm.hpp | 87 ++++++++ modules/dnn/src/init.cpp | 1 + .../dnn/src/layers/cpu_kernels/fast_norm.cpp | 47 +++++ .../dnn/src/layers/cpu_kernels/fast_norm.hpp | 3 + modules/dnn/src/layers/group_norm_layer.cpp | 190 ++++++++++++++++++ modules/dnn/src/onnx/onnx_importer.cpp | 1 + modules/dnn/src/opencl/mvn.cl | 51 +++++ modules/dnn/test/test_onnx_conformance.cpp | 2 + ...conformance_layer_filter__openvino.inl.hpp | 4 + 13 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda4dnn/primitives/group_norm.hpp create mode 100644 modules/dnn/src/layers/group_norm_layer.cpp diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 41fe0df70f..3301f20fde 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -1183,6 +1183,11 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS GroupNormLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + //! @} //! 
@} CV__DNN_INLINE_NS_END diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 66b5ad62c2..946e29ccb4 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -795,6 +795,66 @@ PERF_TEST_P_(Layer_Attention, VisionTransformer) { test_layer({1, 197, 768}, {768, 768, 768}, 12); } +struct Layer_GroupNorm : public TestBaseWithParam > +{ + void test_layer(const std::vector& x_shape, int num_groups) + { + int backendId = get<0>(GetParam()); + int targetId = get<1>(GetParam()); + + Mat x(x_shape, CV_32FC1); + Mat scale(x_shape[1], 1, CV_32FC1); + Mat b(x_shape[1], 1, CV_32FC1); + + randu(x, 0.f, 1.f); + randu(scale, 0.f, 1.f); + randu(b, 0.f, 1.f); + + Net net; + LayerParams lp; + lp.type = "GroupNormalization"; + lp.name = "testLayer"; + lp.set("num_groups", num_groups); + + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup + { + std::vector inpNames{"x", "scale", "b"}; + net.setInputsNames(inpNames); + net.setInput(x, inpNames[0]); + net.setInput(scale, inpNames[1]); + net.setInput(b, inpNames[2]); + + net.setPreferableBackend(backendId); + net.setPreferableTarget(targetId); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); + } + + int N = 2; + int C = 64; + int H = 180; + int W = 240; + int num_groups = 16; +}; + +PERF_TEST_P_(Layer_GroupNorm, GroupNorm) +{ + test_layer({N, C, H, W}, num_groups); +} + + INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false)); INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); #ifdef HAVE_CUDA @@ -807,7 +867,7 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); 
INSTANTIATE_TEST_CASE_P(/**/, Layer_InstanceNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_Attention, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); - +INSTANTIATE_TEST_CASE_P(/**/, Layer_GroupNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); typedef TestBaseWithParam > > Layer_FullyConnected; PERF_TEST_P_(Layer_FullyConnected, fc) diff --git a/modules/dnn/src/cuda/mvn.cu b/modules/dnn/src/cuda/mvn.cu index 0accc499a2..d6db7c4fb4 100644 --- a/modules/dnn/src/cuda/mvn.cu +++ b/modules/dnn/src/cuda/mvn.cu @@ -78,6 +78,18 @@ namespace raw { } } + template + __global__ void normalize_mean_variance_groupwise(Span output, View input, View scale, View bias, View means, View inv_stddev, size_type inner_size, size_type C, size_type num_groups, size_type group_size) { + for (auto idx : grid_stride_range(output.size())) { + const index_type outer_idx = idx / inner_size; + const index_type c = outer_idx % C; + const index_type group_idx = outer_idx / group_size; + auto s = static_cast(scale[c]) * inv_stddev[group_idx]; + auto b = static_cast(bias[c]); + output[idx] = (static_cast(input[idx]) - means[group_idx]) * s + b; + } + } + template __global__ void normalize_mean_variance_layernorm(Span output, View input, View scale, View means, View inv_stddev, size_type inner_size) { for (auto idx : grid_stride_range(output.size())) { @@ -191,6 +203,24 @@ template void normalize_mean_variance_channelwise(const Stream&, Span<__half> /* #endif template void normalize_mean_variance_channelwise(const Stream&, Span /*output*/, View /*input*/, View /*scale*/, View /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t); +template +void normalize_mean_variance_groupwise(const Stream& stream, Span output, View input, View scale, View bias, View means, View inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size) +{ + 
CV_Assert(input.size() == output.size()); + CV_Assert(input.size() / inner_size == means.size() * group_size); + CV_Assert(means.size() == inv_stddev.size()); + + auto kernel = raw::normalize_mean_variance_groupwise; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size, C, num_groups, group_size); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void normalize_mean_variance_groupwise(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t); +#endif +template void normalize_mean_variance_groupwise(const Stream&, Span /*output*/, View /*input*/, View /*scale*/, View /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t); + + template void normalize_mean_variance_layernorm(const Stream& stream, Span output, View input, View scale, View means, View inv_stddev, std::size_t inner_size) { diff --git a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp index 6cddeb22bb..a09dafb76d 100644 --- a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp @@ -35,6 +35,10 @@ void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span o template void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span output, csl::View input, csl::View scale, csl::View bias, csl::View means, csl::View inv_stddev, std::size_t inner_size); +template +void normalize_mean_variance_groupwise(const csl::Stream &stream, csl::Span output, csl::View input, csl::View scale, csl::View bias, csl::View means, csl::View inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size); + + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* 
OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp new file mode 100644 index 0000000000..bb3e162a33 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp @@ -0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" +#include "../csl/tensor.hpp" +#include "../csl/workspace.hpp" + +#include "../kernels/fill_copy.hpp" +#include "../kernels/mvn.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class GroupNormOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + GroupNormOp(csl::Stream stream_, float epsilon_, size_t loops, size_t num_groups) + : stream(std::move(stream_)), epsilon(epsilon_), num_groups(num_groups) { + csl::WorkspaceBuilder builder; + builder.require(loops * num_groups); // mean and stdev for each group + builder.require(loops * num_groups); + scratch_mem_in_bytes = builder.required_workspace_size(); + } + + void forward(const std::vector>& inputs, + const std::vector>& outputs, + csl::Workspace& workspace) override { + auto input_wrapper = inputs[0].dynamicCast(); + auto scale_wrapper = inputs[1].dynamicCast(); + auto bias_wrapper = inputs[2].dynamicCast(); + + auto input = input_wrapper->getView(); + auto scale = scale_wrapper->getView(); + auto bias = bias_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto C = input.get_axis_size(1); + auto loops = input.size_range(0, 2); + auto 
norm_size = input.size_range(2, input.rank()); + auto num_groups = this->num_groups; + auto group_size = C / num_groups; + if (norm_size == 1) { + kernels::fill(stream, output, 0.f); + return; + } else { + auto ws_allocator = csl::WorkspaceAllocator(workspace); + + auto mean = ws_allocator.get_span(loops / group_size); + kernels::fill(stream, mean, 0.f); + + auto stdev = ws_allocator.get_span(loops / group_size); + kernels::fill(stream, stdev, 0.f); + + kernels::reduce_mean_sqr_sum(stream, mean, stdev, input, norm_size * group_size); + kernels::compute_normalization_scale(stream, stdev, mean, stdev, norm_size * group_size, epsilon); + kernels::normalize_mean_variance_groupwise(stream, output, input, scale, bias, mean, stdev, norm_size, C, num_groups, group_size); + } + } + + std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } + + private: + csl::Stream stream; + float epsilon; + std::size_t num_groups; + std::size_t scratch_mem_in_bytes; + }; + +}}} // cv::dnn::cuda4dnn + +#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 9b433dac50..2170aafc4b 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -163,6 +163,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(Expand, ExpandLayer); CV_DNN_REGISTER_LAYER_CLASS(InstanceNormalization, InstanceNormLayer); CV_DNN_REGISTER_LAYER_CLASS(Attention, AttentionLayer); + CV_DNN_REGISTER_LAYER_CLASS(GroupNormalization, GroupNormLayer); CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer); CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer); diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp index ab9d8ee0af..35f354ed29 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp @@ -158,4 +158,51 @@ void fastNormChannel(const Mat &input, const Mat &scale, const Mat 
&bias, Mat &o parallel_for_(Range(0, loops), fn, nstripes); } +void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups) { + const auto input_shape = shape(input); + size_t N = input_shape[0], C = input_shape[1]; + CV_CheckEQ(scale.total(), bias.total(), "fastNormGroup: scale and bias should have the same shape"); + CV_CheckEQ(scale.total(), C, "fastNormGroup: scale should be a 1d tensor and match the channel of input"); + CV_CheckGE(input.dims, 3, "fastNormGroup: input dimension >= 3"); + + size_t channels_per_group = C / num_groups; + size_t loops = N * num_groups; + size_t norm_size = static_cast(total(input_shape, 2) * channels_per_group); + size_t step = norm_size / channels_per_group; + float inv_norm_size = 1.0 / norm_size; + + auto fn = [&](const Range &r) { + const auto *input_data = input.ptr(); + const auto *scale_data = scale.ptr(); + const auto *bias_data = bias.ptr(); + auto *output_data = output.ptr(); + + for (int i = r.start; i < r.end; i++) { + const auto *x = input_data + norm_size * i; + auto *y = output_data + norm_size * i; + + float mean = 0.f, mean_square = 0.f; + for (int j = 0; j < norm_size; j++) { + float v = x[j]; + mean += v; + mean_square += v * v; + } + + mean *= inv_norm_size; + mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon); + float inv_stdev = 1.f / mean_square; + + size_t group_idx = i % num_groups * channels_per_group; + for (size_t j = 0; j < norm_size; j++) { + size_t c = group_idx + (j / step); + float s = scale_data[c] * inv_stdev, b = bias_data[c]; + y[j] = s * (x[j] - mean) + b; + } + } + }; + + double nstripes = loops * norm_size * (1 / 1024.0); + parallel_for_(Range(0, loops), fn, nstripes); +} + }} // cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp index 61316542d3..72cbdad0a7 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp +++ 
b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp @@ -21,6 +21,9 @@ void fastNorm(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, // Channel-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3. void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon); +// Group-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3. +void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups); + }} // cv::dnn #endif // OPENCV_DNN_FAST_NORM_HPP diff --git a/modules/dnn/src/layers/group_norm_layer.cpp b/modules/dnn/src/layers/group_norm_layer.cpp new file mode 100644 index 0000000000..006e8fe7f8 --- /dev/null +++ b/modules/dnn/src/layers/group_norm_layer.cpp @@ -0,0 +1,190 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "../precomp.hpp" +#include +#include "./cpu_kernels/fast_norm.hpp" + +// CUDA backend +#include "../op_cuda.hpp" +#ifdef HAVE_CUDA +#include "../cuda4dnn/primitives/group_norm.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + +// OpenCL backend +#ifdef HAVE_OPENCL +#include "../ocl4dnn/include/math_functions.hpp" +#include "opencl_kernels_dnn.hpp" +#endif + +namespace cv { +namespace dnn { + +// https://github.com/onnx/onnx/blob/main/docs/Operators.md#GroupNormalization +class GroupNormLayerImpl CV_FINAL : public GroupNormLayer { +public: + GroupNormLayerImpl(const LayerParams ¶ms) { + setParamsFrom(params); + + epsilon = params.get("epsilon", 1e-5); + num_groups = params.get("num_groups"); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE { + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_CUDA; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE { + const auto &input = inputs[0]; + const auto &scale = inputs[1]; + const auto &bias = inputs[2]; + CV_CheckGE(input.size(), static_cast(3), "DNN/GroupNorm: input dimension >= 3 is required"); + + int C = input[1]; + int scale_dim = std::accumulate(scale.begin(), scale.end(), 1, std::multiplies()); + CV_CheckEQ(scale_dim, C, "DNN/InstanceNorm: scale must be a 1d tensor and match the channel of input"); + int bias_dim = std::accumulate(bias.begin(), bias.end(), 1, std::multiplies()); + CV_CheckEQ(bias_dim, C, "DNN/InstanceNorm: bias must be a 1d tensor and match the channel of input"); + + outputs.assign(1, inputs[0]); + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + if (inputs_arr.depth() == CV_16S) { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, 
outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + const auto& input = inputs[0]; + const auto& scale = inputs[1]; + const auto& bias = inputs[2]; + + fastNormGroup(input, scale, bias, outputs[0], epsilon, num_groups); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + const auto &input = inputs[0], &scale = inputs[1], &bias = inputs[2]; + auto &output = outputs[0]; + + const auto input_shape = shape(input); + size_t N = input_shape[0], C = input_shape[1]; + size_t num_groups = this->num_groups; + size_t channels_per_group = C / num_groups; + size_t loops = N * num_groups, norm_size = static_cast(total(input_shape, 2)) * channels_per_group; + float inv_norm_size = 1.f / norm_size; + + // no fp16 support + if (input.depth() == CV_16S) { + return false; + } + + String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4"); + + // Calculate mean + UMat one = UMat::ones(norm_size, 1, CV_32F); + UMat mean = UMat(loops, 1, CV_32F); + UMat mean_square = UMat(loops, 1, CV_32F); + UMat tmp = UMat(loops, norm_size, CV_32F); + bool ret = ocl4dnn::ocl4dnnGEMV(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size, + input, 0, one, 0, 0.f, mean, 0); + if (!ret) { + return false; + } + // Calculate mean_square + int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 
4 : 1); + size_t global[] = {loops, static_cast(norm_size / num_vector)}; + String build_opt = format(" -DNUM=%d", num_vector) + base_opts; + String mean_square_kernel_name = format("calc_mean%d", num_vector); + ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN"); + if (mean_square_kernel.empty()) { + return false; + } + mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input)); + mean_square_kernel.set(1, (int)loops); + mean_square_kernel.set(2, (int)norm_size); + mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean)); + mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp)); + ret = mean_square_kernel.run(2, global, NULL, false); + if (!ret) { + return false; + } + ret = ocl4dnn::ocl4dnnGEMV(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size, + tmp, 0, one, 0, 0.f, mean_square, 0); + if (!ret) { + return false; + } + // Calculate group norm: output = scale * (x - mean) / sqrt(var + eps) + bias + String mvn_group_kernel_name = format("mvn_group%d", num_vector); + build_opt += " -DNORM_VARIANCE -DKERNEL_MVN_GROUP"; + ocl::Kernel mvn_group_kernel(mvn_group_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt); + if (mvn_group_kernel.empty()) { + return false; + } + mvn_group_kernel.set(0, ocl::KernelArg::PtrReadOnly(input)); + mvn_group_kernel.set(1, (int)loops); + mvn_group_kernel.set(2, (int)norm_size); + mvn_group_kernel.set(3, (float)epsilon); + mvn_group_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean)); + mvn_group_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square)); + mvn_group_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale)); + mvn_group_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias)); + mvn_group_kernel.set(8, (int)C); + mvn_group_kernel.set(9, (int)num_groups); + mvn_group_kernel.set(10, (float)0.f); + mvn_group_kernel.set(11, ocl::KernelArg::PtrWriteOnly(output)); + ret = mvn_group_kernel.run(2, global, NULL, false); + if (!ret) { + return false; + } + + return true; 
+ } +#endif + +#ifdef HAVE_CUDA + Ptr initCUDA(void *context_, + const std::vector>& inputs, + const std::vector>& outputs) override { + auto context = reinterpret_cast(context_); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input_shape = input_wrapper->getShape(); + size_t N = input_shape[0]; + size_t num_groups = this->num_groups; + size_t loops = N * num_groups; + + return make_cuda_node(preferableTarget, std::move(context->stream), epsilon, loops, num_groups); +} +#endif // HAVE_CUDA + +private: + float epsilon; + size_t num_groups; +}; + +Ptr GroupNormLayer::create(const LayerParams ¶ms) { + return Ptr(new GroupNormLayerImpl(params)); +} + +}} // cv::dnn diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index f0b33d111b..a6acc6e800 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -4008,6 +4008,7 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version) dispatch["ScatterElements"] = dispatch["Scatter"] = dispatch["ScatterND"] = &ONNXImporter::parseScatter; dispatch["Tile"] = &ONNXImporter::parseTile; dispatch["LayerNormalization"] = &ONNXImporter::parseLayerNorm; + dispatch["GroupNormalization"] = &ONNXImporter::parseInstanceNormalization; dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] = dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = dispatch["GreaterOrEqual"] = diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl index 7353ed8b82..053749b483 100644 --- a/modules/dnn/src/opencl/mvn.cl +++ b/modules/dnn/src/opencl/mvn.cl @@ -54,6 +54,7 @@ #define vec_type Dtype8 #define CALC_MEAN calc_mean8 #define MVN mvn8 + #define MVN_GROUP mvn_group8 #define MEAN_FUSE mean_fuse8 #define MVN_FUSE mvn_fuse8 #elif NUM == 4 @@ -62,6 +63,7 @@ #define vec_type Dtype4 #define CALC_MEAN calc_mean4 #define MVN mvn4 + #define MVN_GROUP mvn_group4 #define MEAN_FUSE mean_fuse4 #define MVN_FUSE 
mvn_fuse4 #elif NUM == 1 @@ -70,6 +72,7 @@ #define vec_type Dtype #define CALC_MEAN calc_mean1 #define MVN mvn1 + #define MVN_GROUP mvn_group1 #define MEAN_FUSE mean_fuse1 #define MVN_FUSE mvn_fuse1 #endif @@ -150,6 +153,54 @@ __kernel void MVN(__global const Dtype* src, store(dst_vec, dst, index); } +#elif defined KERNEL_MVN_GROUP + +__kernel void MVN_GROUP(__global const Dtype* src, + const int rows, + const int cols, + const Dtype eps, + __global const Dtype* mean, + __global const Dtype* dev, + __global const Dtype* weight, + __global const Dtype* bias, + const int channels, + const int num_groups, + const float relu_slope, + __global Dtype* dst) +{ + int x = get_global_id(0); + int y = get_global_id(1) * NUM; + int index = x * cols + y; + + if (x >= rows || y >= cols) + return; + + int group_size = channels / num_groups; + int step = norm_size / group_size; + int channel_index = x % num_groups * group_size + y / step + Dtype mean_val = mean[x]; + Dtype dev_val = dev[x]; + Dtype alpha; +#ifdef NORM_VARIANCE + alpha = 1 / sqrt(eps + dev_val); +#else + alpha = 1; +#endif + + Dtype w = weight[channel_index], b = bias[channel_index]; + + vec_type src_vec = load(src, index) - (vec_type)mean_val; + vec_type dst_vec = src_vec * alpha; + dst_vec = dst_vec * w + (vec_type)b; + +#ifdef FUSE_RELU + vec_type new_val = dst_vec * relu_slope; + dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f); +#endif + + store(dst_vec, dst, index); +} + #elif defined KERNEL_MEAN_FUSE __kernel void MEAN_FUSE(__global const T * A, diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp index 5b783722c4..1ca3f2f75b 100644 --- a/modules/dnn/test/test_onnx_conformance.cpp +++ b/modules/dnn/test/test_onnx_conformance.cpp @@ -311,6 +311,8 @@ static const TestCase testConformanceConfig[] = { {"test_gridsample_nearest", 2, 1}, {"test_gridsample_reflection_padding", 2, 1}, {"test_gridsample_zeros_padding", 2, 1}, + 
{"test_group_normalization_epsilon", 3, 1}, + {"test_group_normalization_example", 3, 1}, {"test_gru_batchwise", 3, 2}, {"test_gru_defaults", 3, 1}, {"test_gru_seq_length", 4, 1}, diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp index 199bfdcd18..291ea30e92 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp @@ -736,6 +736,10 @@ CASE(test_gridsample_reflection_padding) // no filter CASE(test_gridsample_zeros_padding) // no filter +CASE(test_group_normalization_epsilon) + // no filter +CASE(test_group_normalization_example) + // no filter CASE(test_gru_batchwise) // no filter CASE(test_gru_defaults) From 2791bb70627fb009b3995da746f47ed3544df3a9 Mon Sep 17 00:00:00 2001 From: Stefan Dragnev Date: Fri, 12 Jan 2024 14:23:05 +0100 Subject: [PATCH 39/57] Merge pull request #24773 from tailsu:sd/pathlike python: accept path-like objects wherever file names are expected #24773 Merry Christmas, all :christmas_tree: Implements #15731 Support is enabled for all arguments named `filename` or `filepath` (case-insensitive), or annotated with `CV_WRAP_FILE_PATH`. Support is based on `PyOS_FSPath`, which is available in Python 3.6+. When running on older Python versions the arguments must have a `str` value as before. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. 
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- modules/core/include/opencv2/core/cvdef.h | 2 + modules/dnn/include/opencv2/dnn/dnn.hpp | 42 +++++++++---------- .../include/opencv2/objdetect/barcode.hpp | 2 +- .../include/opencv2/objdetect/face.hpp | 6 +-- modules/python/src2/cv2.hpp | 5 ++- modules/python/src2/cv2_convert.cpp | 12 ++++++ modules/python/src2/gen2.py | 5 +++ modules/python/src2/hdr_parser.py | 6 +++ modules/python/test/test_pathlike.py | 38 +++++++++++++++++ 9 files changed, 92 insertions(+), 26 deletions(-) create mode 100644 modules/python/test/test_pathlike.py diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 99cd66f8cc..b55ac3b4ba 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -475,6 +475,8 @@ Cv64suf; #define CV_WRAP_MAPPABLE(mappable) #define CV_WRAP_PHANTOM(phantom_header) #define CV_WRAP_DEFAULT(val) +/* Indicates that the function parameter has filesystem path semantic */ +#define CV_WRAP_FILE_PATH /****************************************************************************************\ * Matrix type (Mat) * diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 3140709238..71cb01ad21 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -484,7 +484,7 @@ CV__DNN_INLINE_NS_BEGIN * Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine * backend. 
*/ - CV_WRAP static Net readFromModelOptimizer(const String& xml, const String& bin); + CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin); /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR). * @param[in] bufferModelConfig buffer with model's configuration. @@ -517,7 +517,7 @@ CV__DNN_INLINE_NS_BEGIN * @param path path to output file with .dot extension * @see dump() */ - CV_WRAP void dumpToFile(const String& path); + CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path); /** @brief Adds new layer to the net. * @param name unique name of the adding layer. * @param type typename of the adding layer (type must be registered in LayerRegister). @@ -890,7 +890,7 @@ CV__DNN_INLINE_NS_BEGIN * @param darknetModel path to the .weights file with learned network. * @returns Network object that ready to do forward, throw an exception in failure cases. */ - CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String()); + CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String()); /** @brief Reads a network model stored in Darknet model files. * @param bufferCfg A buffer contains a content of .cfg file with text description of the network architecture. @@ -915,7 +915,7 @@ CV__DNN_INLINE_NS_BEGIN * @param caffeModel path to the .caffemodel file with learned network. * @returns Net object. */ - CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String()); + CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String()); /** @brief Reads a network model stored in Caffe model in memory. * @param bufferProto buffer containing the content of the .prototxt file @@ -944,7 +944,7 @@ CV__DNN_INLINE_NS_BEGIN * let us make it more flexible. 
* @returns Net object. */ - CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String()); + CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String()); /** @brief Reads a network model stored in TensorFlow framework's format. * @param bufferModel buffer containing the content of the pb file @@ -969,7 +969,7 @@ CV__DNN_INLINE_NS_BEGIN * @param model path to the .tflite file with binary flatbuffers description of the network architecture * @returns Net object. */ - CV_EXPORTS_W Net readNetFromTFLite(const String &model); + CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model); /** @brief Reads a network model stored in TFLite framework's format. * @param bufferModel buffer containing the content of the tflite file @@ -1011,7 +1011,7 @@ CV__DNN_INLINE_NS_BEGIN * * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported. */ - CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true); + CV_EXPORTS_W Net readNetFromTorch(CV_WRAP_FILE_PATH const String &model, bool isBinary = true, bool evaluate = true); /** * @brief Read deep learning network represented in one of the supported formats. @@ -1037,7 +1037,7 @@ CV__DNN_INLINE_NS_BEGIN * @ref readNetFromTorch or @ref readNetFromDarknet. An order of @p model and @p config * arguments does not matter. */ - CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = ""); + CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = ""); /** * @brief Read deep learning network represented in one of the supported formats. @@ -1064,7 +1064,7 @@ CV__DNN_INLINE_NS_BEGIN * backend. 
*/ CV_EXPORTS_W - Net readNetFromModelOptimizer(const String &xml, const String &bin = ""); + Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = ""); /** @brief Load a network from Intel's Model Optimizer intermediate representation. * @param[in] bufferModelConfig Buffer contains XML configuration with network's topology. @@ -1093,7 +1093,7 @@ CV__DNN_INLINE_NS_BEGIN * @param onnxFile path to the .onnx file with text description of the network architecture. * @returns Network object that ready to do forward, throw an exception in failure cases. */ - CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile); + CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile); /** @brief Reads a network model from ONNX * in-memory buffer. @@ -1116,7 +1116,7 @@ CV__DNN_INLINE_NS_BEGIN * @param path to the .pb file with input tensor. * @returns Mat. */ - CV_EXPORTS_W Mat readTensorFromONNX(const String& path); + CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path); /** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center, * subtract @p mean values, scales values by @p scalefactor, swap Blue and Red channels. @@ -1289,7 +1289,7 @@ CV__DNN_INLINE_NS_BEGIN * is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe. * So the resulting model may be used there. */ - CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst, + CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst, const std::vector& layersTypes = std::vector()); /** @brief Create a text representation for a binary network stored in protocol buffer format. @@ -1298,7 +1298,7 @@ CV__DNN_INLINE_NS_BEGIN * * @note To reduce output file size, trained weights are not included. 
*/ - CV_EXPORTS_W void writeTextGraph(const String& model, const String& output); + CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output); /** @brief Performs non maximum suppression given boxes and corresponding scores. @@ -1403,7 +1403,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP Model(const String& model, const String& config = ""); + CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1508,7 +1508,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP ClassificationModel(const String& model, const String& config = ""); + CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1558,7 +1558,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP KeypointsModel(const String& model, const String& config = ""); + CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1590,7 +1590,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP SegmentationModel(const String& model, const String& config = ""); + CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. 
@@ -1621,7 +1621,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP DetectionModel(const String& model, const String& config = ""); + CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1687,7 +1687,7 @@ public: * @param[in] config Text file contains network configuration */ CV_WRAP inline - TextRecognitionModel(const std::string& model, const std::string& config = "") + TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextRecognitionModel(readNet(model, config)) { /* nothing */ } /** @@ -1842,7 +1842,7 @@ public: * @param[in] config Text file contains network configuration. */ CV_WRAP inline - TextDetectionModel_EAST(const std::string& model, const std::string& config = "") + TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ } /** @@ -1903,7 +1903,7 @@ public: * @param[in] config Text file contains network configuration. 
*/ CV_WRAP inline - TextDetectionModel_DB(const std::string& model, const std::string& config = "") + TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_DB(readNet(model, config)) { /* nothing */ } CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold); diff --git a/modules/objdetect/include/opencv2/objdetect/barcode.hpp b/modules/objdetect/include/opencv2/objdetect/barcode.hpp index 958490a422..788889ad40 100644 --- a/modules/objdetect/include/opencv2/objdetect/barcode.hpp +++ b/modules/objdetect/include/opencv2/objdetect/barcode.hpp @@ -27,7 +27,7 @@ public: * @param prototxt_path prototxt file path for the super resolution model * @param model_path model file path for the super resolution model */ - CV_WRAP BarcodeDetector(const std::string &prototxt_path, const std::string &model_path); + CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path); ~BarcodeDetector(); /** @brief Decodes barcode in image once it's found by the detect() method. 
diff --git a/modules/objdetect/include/opencv2/objdetect/face.hpp b/modules/objdetect/include/opencv2/objdetect/face.hpp index 9b53f83128..bfa04cbd16 100644 --- a/modules/objdetect/include/opencv2/objdetect/face.hpp +++ b/modules/objdetect/include/opencv2/objdetect/face.hpp @@ -82,8 +82,8 @@ public: * @param backend_id the id of backend * @param target_id the id of target device */ - CV_WRAP static Ptr create(const String& model, - const String& config, + CV_WRAP static Ptr create(CV_WRAP_FILE_PATH const String& model, + CV_WRAP_FILE_PATH const String& config, const Size& input_size, float score_threshold = 0.9f, float nms_threshold = 0.3f, @@ -154,7 +154,7 @@ public: * @param backend_id the id of backend * @param target_id the id of target device */ - CV_WRAP static Ptr create(const String& model, const String& config, int backend_id = 0, int target_id = 0); + CV_WRAP static Ptr create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0); }; //! 
@} diff --git a/modules/python/src2/cv2.hpp b/modules/python/src2/cv2.hpp index 2697c781ca..06080f1aa1 100644 --- a/modules/python/src2/cv2.hpp +++ b/modules/python/src2/cv2.hpp @@ -45,17 +45,20 @@ class ArgInfo private: static const uint32_t arg_outputarg_flag = 0x1; static const uint32_t arg_arithm_op_src_flag = 0x2; + static const uint32_t arg_pathlike_flag = 0x4; public: const char* name; bool outputarg; bool arithm_op_src; + bool pathlike; // more fields may be added if necessary ArgInfo(const char* name_, uint32_t arg_) : name(name_), outputarg((arg_ & arg_outputarg_flag) != 0), - arithm_op_src((arg_ & arg_arithm_op_src_flag) != 0) {} + arithm_op_src((arg_ & arg_arithm_op_src_flag) != 0), + pathlike((arg_ & arg_pathlike_flag) != 0) {} private: ArgInfo(const ArgInfo&) = delete; diff --git a/modules/python/src2/cv2_convert.cpp b/modules/python/src2/cv2_convert.cpp index 40e1608fae..c4a867892a 100644 --- a/modules/python/src2/cv2_convert.cpp +++ b/modules/python/src2/cv2_convert.cpp @@ -701,6 +701,18 @@ bool pyopencv_to(PyObject* obj, String &value, const ArgInfo& info) return true; } std::string str; + +#if ((PY_VERSION_HEX >= 0x03060000) && !defined(Py_LIMITED_API)) || (Py_LIMITED_API >= 0x03060000) + if (info.pathlike) + { + obj = PyOS_FSPath(obj); + if (PyErr_Occurred()) + { + failmsg("Expected '%s' to be a str or path-like object", info.name); + return false; + } + } +#endif if (getUnicodeString(obj, str)) { value = str; diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index c04d661227..29a91958ee 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -500,6 +500,10 @@ class ArgInfo(object): def outputarg(self): return '/O' in self._modifiers or '/IO' in self._modifiers + @property + def pathlike(self): + return '/PATH' in self._modifiers + @property def returnarg(self): return self.outputarg @@ -523,6 +527,7 @@ class ArgInfo(object): def crepr(self): arg = 0x01 if self.outputarg else 0x0 arg += 0x02 if 
self.arithm_op_src_arg else 0x0 + arg += 0x04 if self.pathlike else 0x0 return "ArgInfo(\"%s\", %d)" % (self.name, arg) diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index 0dc5dd1488..34bcd585ce 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -90,6 +90,10 @@ class CppHeaderParser(object): modlist.append("/IO") arg_str = arg_str.replace("CV_IN_OUT", "") + if "CV_WRAP_FILE_PATH" in arg_str: + modlist.append("/PATH") + arg_str = arg_str.replace("CV_WRAP_FILE_PATH", "") + isarray = False npos = arg_str.find("CV_CARRAY") if npos >= 0: @@ -627,6 +631,8 @@ class CppHeaderParser(object): ("noArray", arg_type)]).strip() if '/IO' in modlist and '/O' in modlist: modlist.remove('/O') + if (arg_name.lower() == 'filename' or arg_name.lower() == 'filepath') and '/PATH' not in modlist: + modlist.append('/PATH') args.append([arg_type, arg_name, defval, modlist]) npos = arg_start-1 diff --git a/modules/python/test/test_pathlike.py b/modules/python/test/test_pathlike.py new file mode 100644 index 0000000000..d654ce24ad --- /dev/null +++ b/modules/python/test/test_pathlike.py @@ -0,0 +1,38 @@ +from tests_common import NewOpenCVTests, unittest +import cv2 as cv +import os + + +def import_path(): + import sys + if sys.version_info[0] < 3 or sys.version_info[1] < 6: + raise unittest.SkipTest('Python 3.6+ required') + + from pathlib import Path + return Path + + +class CanPassPathLike(NewOpenCVTests): + def test_pathlib_path(self): + Path = import_path() + + img_path = self.find_file('cv/imgproc/stuff.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')]) + + image_from_str = cv.imread(img_path) + self.assertIsNotNone(image_from_str) + + image_from_path = cv.imread(Path(img_path)) + self.assertIsNotNone(image_from_path) + + + def test_type_mismatch(self): + import_path() # checks python version + + with self.assertRaises(TypeError) as context: + cv.imread(123) + + self.assertTrue('str or path-like' in 
str(context.exception)) + + +if __name__ == '__main__': + NewOpenCVTests.bootstrap() From a289eba357dcdc21d088315cbf981afe9d7bb439 Mon Sep 17 00:00:00 2001 From: Dhanwanth1803 Date: Sat, 13 Jan 2024 09:55:15 +0530 Subject: [PATCH 40/57] Fixes #24677 --- modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp index 6cf066576b..d19cec64de 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -338,7 +338,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu } #if CV_TRY_AVX2 if (conv->useAVX2) - opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); else #endif From b04de14fbbb9daae65ac76495ff2cc4f99919525 Mon Sep 17 00:00:00 2001 From: Zhuo Zhang Date: Tue, 16 Jan 2024 13:50:50 +0800 Subject: [PATCH 41/57] Fix QNX build Based on https://github.com/opencv/opencv/issues/24567 --- modules/core/src/system.cpp | 8 +++++++- modules/ts/CMakeLists.txt | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index c61fd67a19..c29b97f880 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -120,11 +120,15 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); } #include // std::abort #endif -#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ +#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || 
defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ || defined __QNX__ # include # include #if defined __QNX__ # include +# include +using Elf64_auxv_t = auxv64_t; +# include +constexpr decltype(auto) AT_HWCAP = NT_GNU_HWCAP; #else # include #endif @@ -581,10 +585,12 @@ struct HWFeatures have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP } +#if defined(AT_HWCAP2) else if (auxv.a_type == AT_HWCAP2) { have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16 } +#endif } close(cpufile); diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt index c1d249ea14..63edae1e67 100644 --- a/modules/ts/CMakeLists.txt +++ b/modules/ts/CMakeLists.txt @@ -47,3 +47,7 @@ if(OPENCV_DISABLE_THREAD_SUPPORT) # described in `ts_gtest.h`. ocv_target_compile_definitions(${the_module} PUBLIC GTEST_HAS_PTHREAD=0) endif() + +if(CMAKE_SYSTEM_NAME STREQUAL "QNX") + ocv_target_link_libraries(${the_module} PUBLIC regex) +endif() \ No newline at end of file From 26cf82a56c1e96073974b5ec06eab80c4af347b7 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 16 Jan 2024 09:40:52 +0300 Subject: [PATCH 42/57] Normalize axis parameter in DNN Concat to handle negative values. 
--- modules/dnn/src/onnx/onnx_importer.cpp | 1 + ...st_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index a6acc6e800..5afff7db00 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -2617,6 +2617,7 @@ void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::Node // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1 int axis = layerParams.get("axis", 1); + axis = normalize_axis(axis, inputShape.size()); for (size_t i = 0; i < inputs.size(); ++i) { MatShape targetShape = inputShape; diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp index e2ea428939..0da0111990 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp @@ -41,7 +41,7 @@ "test_cast_STRING_to_FLOAT", "test_castlike_FLOAT_to_STRING_expanded", "test_castlike_STRING_to_FLOAT_expanded", -"test_concat_1d_axis_negative_1", +"test_concat_1d_axis_negative_1", // 1d support is required "test_div_uint8", // output type mismatch "test_maxpool_2d_dilations", "test_maxpool_2d_same_lower", From 1a3ef9ccd49616b2690fbf119504b7610a778206 Mon Sep 17 00:00:00 2001 From: Ingrid Wang Date: Tue, 16 Jan 2024 16:49:58 -0500 Subject: [PATCH 43/57] Replace deprecated symbols AVVideoCodecH264 and AVVideoCodecJPEG --- modules/videoio/misc/objc/ios/CvPhotoCamera2.m | 2 +- modules/videoio/misc/objc/ios/CvVideoCamera2.mm | 2 +- modules/videoio/src/cap_avfoundation_mac.mm | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m 
b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m index 460cce6d32..281929c558 100644 --- a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m +++ b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m @@ -105,7 +105,7 @@ { // setup still image output with jpeg codec self.stillImageOutput = [[AVCaptureStillImageOutput alloc] init]; - NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecJPEG, AVVideoCodecKey, nil]; + NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecTypeJPEG, AVVideoCodecKey, nil]; [self.stillImageOutput setOutputSettings:outputSettings]; [self.captureSession addOutput:self.stillImageOutput]; diff --git a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm index 7f4abdb578..188d6c5ec7 100644 --- a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm +++ b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm @@ -315,7 +315,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;} NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:self.imageWidth], AVVideoWidthKey, [NSNumber numberWithInt:self.imageHeight], AVVideoHeightKey, - AVVideoCodecH264, AVVideoCodecKey, + AVVideoCodecTypeH264, AVVideoCodecKey, nil ]; diff --git a/modules/videoio/src/cap_avfoundation_mac.mm b/modules/videoio/src/cap_avfoundation_mac.mm index c0ad4810d4..98df630c74 100644 --- a/modules/videoio/src/cap_avfoundation_mac.mm +++ b/modules/videoio/src/cap_avfoundation_mac.mm @@ -1220,13 +1220,13 @@ CvVideoWriter_AVFoundation::CvVideoWriter_AVFoundation(const std::string &filena is_good = false; } - // Three codec supported AVVideoCodecH264 AVVideoCodecJPEG AVVideoCodecTypeHEVC + // Three codec supported AVVideoCodecTypeH264 AVVideoCodecTypeJPEG AVVideoCodecTypeHEVC // On iPhone 3G H264 is not supported. 
if (fourcc == CV_FOURCC('J','P','E','G') || fourcc == CV_FOURCC('j','p','e','g') || fourcc == CV_FOURCC('M','J','P','G') || fourcc == CV_FOURCC('m','j','p','g')){ - codec = [AVVideoCodecJPEG copy]; // Use JPEG codec if specified, otherwise H264 + codec = [AVVideoCodecTypeJPEG copy]; // Use JPEG codec if specified, otherwise H264 }else if(fourcc == CV_FOURCC('H','2','6','4') || fourcc == CV_FOURCC('a','v','c','1')){ - codec = [AVVideoCodecH264 copy]; + codec = [AVVideoCodecTypeH264 copy]; // Available since macOS 10.13 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 101300 }else if(fourcc == CV_FOURCC('H','2','6','5') || fourcc == CV_FOURCC('h','v','c','1') || From 6c39fbc33f04c1ce7a48dbe8abd78ff256b5fdb3 Mon Sep 17 00:00:00 2001 From: _Burnside <33724737+Octopus136@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:20:03 +0800 Subject: [PATCH 44/57] Merge pull request #24852 from Octopus136:4.x Make \epsilon parameter accessible in VariationalRefinement #24852 Resolves #24847 I believe this is necessary to expose \epsilon parameter. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [ ] The feature is well documented and sample code can be built with the project CMake --- modules/video/include/opencv2/video/tracking.hpp | 12 ++++++++++++ modules/video/src/dis_flow.cpp | 6 ++++++ modules/video/src/variational_refinement.cpp | 2 ++ 3 files changed, 20 insertions(+) diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp index 8dbcfbf216..df34a9f97c 100644 --- a/modules/video/include/opencv2/video/tracking.hpp +++ b/modules/video/include/opencv2/video/tracking.hpp @@ -564,6 +564,12 @@ public: /** @copybrief getGamma @see getGamma */ CV_WRAP virtual void setGamma(float val) = 0; + /** @brief Norm value shift for robust penalizer + @see setEpsilon */ + CV_WRAP virtual float getEpsilon() const = 0; + /** @copybrief getEpsilon @see getEpsilon */ + CV_WRAP virtual void setEpsilon(float val) = 0; + /** @brief Creates an instance of VariationalRefinement */ CV_WRAP static Ptr create(); @@ -645,6 +651,12 @@ public: /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */ CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0; + /** @brief Norm value shift for robust penalizer + @see setVariationalRefinementEpsilon */ + CV_WRAP virtual float getVariationalRefinementEpsilon() const = 0; + /** @copybrief getVariationalRefinementEpsilon @see getVariationalRefinementEpsilon */ + CV_WRAP virtual void setVariationalRefinementEpsilon(float val) = 0; + /** @brief Whether to use mean-normalization of patches when computing patch distance. 
It is turned on by default as it typically provides a noticeable quality boost because of increased robustness to diff --git a/modules/video/src/dis_flow.cpp b/modules/video/src/dis_flow.cpp index 40ac4517a4..75090d093d 100644 --- a/modules/video/src/dis_flow.cpp +++ b/modules/video/src/dis_flow.cpp @@ -67,6 +67,7 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow float variational_refinement_alpha; float variational_refinement_gamma; float variational_refinement_delta; + float variational_refinement_epsilon; bool use_mean_normalization; bool use_spatial_propagation; @@ -92,6 +93,8 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; } float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; } void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; } + float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; } + void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; } bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; } void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; } @@ -219,6 +222,7 @@ DISOpticalFlowImpl::DISOpticalFlowImpl() variational_refinement_alpha = 20.f; variational_refinement_gamma = 10.f; variational_refinement_delta = 5.f; + variational_refinement_epsilon = 0.01f; border_size = 16; use_mean_normalization = true; @@ -306,6 +310,7 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_fl variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); variational_refinement_processors[i]->setDelta(variational_refinement_delta); variational_refinement_processors[i]->setGamma(variational_refinement_gamma); + 
variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); variational_refinement_processors[i]->setSorIterations(5); variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); @@ -1274,6 +1279,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); variational_refinement_processors[i]->setDelta(variational_refinement_delta); variational_refinement_processors[i]->setGamma(variational_refinement_gamma); + variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); variational_refinement_processors[i]->setSorIterations(5); variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); diff --git a/modules/video/src/variational_refinement.cpp b/modules/video/src/variational_refinement.cpp index 968bce6717..b1891c60df 100644 --- a/modules/video/src/variational_refinement.cpp +++ b/modules/video/src/variational_refinement.cpp @@ -76,6 +76,8 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement void setDelta(float val) CV_OVERRIDE { delta = val; } float getGamma() const CV_OVERRIDE { return gamma; } void setGamma(float val) CV_OVERRIDE { gamma = val; } + float getEpsilon() const CV_OVERRIDE { return epsilon; } + void setEpsilon(float val) CV_OVERRIDE { epsilon = val; } protected: //!< internal buffers /* This struct defines a special data layout for Mat_. 
Original buffer is split into two: one for "red" From d30bf1bc3cac1ea81505f77b796644c56ff60569 Mon Sep 17 00:00:00 2001 From: Abduragim Date: Wed, 3 Jan 2024 12:42:10 +0300 Subject: [PATCH 45/57] added test for yolo nas --- modules/dnn/test/test_onnx_importer.cpp | 70 +++++++++++++++++++++---- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 4d56cb0e17..dc8d7dc0a0 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -2666,24 +2666,36 @@ void yoloPostProcessing( cv::transposeND(outs[0], {0, 2, 1}, outs[0]); } - // each row is [cx, cy, w, h, conf_obj, conf_class1, ..., conf_class80] + if (test_name == "yolonas"){ + // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84] + Mat concat_out; + // squeeze the first dimension + outs[0] = outs[0].reshape(1, outs[0].size[1]); + outs[1] = outs[1].reshape(1, outs[1].size[1]); + cv::hconcat(outs[1], outs[0], concat_out); + outs[0] = concat_out; + // remove the second element + outs.pop_back(); + // unsqueeze the first dimension + outs[0] = outs[0].reshape(0, std::vector{1, 8400, 84}); + } + for (auto preds : outs){ preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85] - for (int i = 0; i < preds.rows; ++i) { - // filter out non objects - float obj_conf = (test_name != "yolov8") ? preds.at(i, 4) : 1.0f; + // filter out non object + float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at(i, 4) ; if (obj_conf < conf_threshold) continue; - Mat scores = preds.row(i).colRange((test_name != "yolov8") ? 5 : 4, preds.cols); + Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols); double conf; Point maxLoc; minMaxLoc(scores, 0, &conf, 0, &maxLoc); - conf = (test_name != "yolov8") ? 
conf * obj_conf : conf; + conf = (test_name == "yolov8" || test_name == "yolonas") ? conf : conf * obj_conf; if (conf < conf_threshold) continue; @@ -2694,10 +2706,15 @@ void yoloPostProcessing( double w = det[2]; double h = det[3]; + // std::cout << "cx: " << cx << " cy: " << cy << " w: " << w << " h: " << h << " conf: " << conf << " idx: " << maxLoc.x << std::endl; // [x1, y1, x2, y2] - boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, - cx + 0.5 * w, cy + 0.5 * h)); - classIds.push_back(maxLoc.x); + if (test_name == "yolonas"){ + boxes.push_back(Rect2d(cx, cy, w, h)); + } else { + boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, + cx + 0.5 * w, cy + 0.5 * h)); + } + classIds.push_back(maxLoc.x); confidences.push_back(conf); } } @@ -2751,6 +2768,41 @@ TEST_P(Test_ONNX_nets, YOLOX) 1.0e-4, 1.0e-4); } +TEST_P(Test_ONNX_nets, YOLONas) +{ + // model information: https://dl.opencv.org/models/yolo-nas/Readme.md + std::string weightPath = _tf("models/yolo_nas_s.onnx", false); + + Size targetSize{640, 640}; + float conf_threshold = 0.50; + float iou_threshold = 0.50; + + std::vector refClassIds{1, 16, 7}; + std::vector refScores{0.9720f, 0.9283f, 0.8990f}; + // [x1, y1, x2, y2] + std::vector refBoxes{ + Rect2d(105.516, 173.696, 471.323, 430.433), + Rect2d(109.241, 263.406, 259.872, 531.858), + Rect2d(390.153, 142.492, 574.932, 222.709) + }; + + Image2BlobParams imgParams( + Scalar::all(1/255.0), + targetSize, + Scalar::all(0), + false, + CV_32F, + DNN_LAYOUT_NCHW, + DNN_PMODE_LETTERBOX, + Scalar::all(114) + ); + + testYOLO( + weightPath, refClassIds, refScores, refBoxes, + imgParams, conf_threshold, iou_threshold, + 1.0e-4, 1.0e-4, "yolonas"); +} + TEST_P(Test_ONNX_nets, YOLOv8) { std::string weightPath = _tf("models/yolov8n.onnx", false); From 985506c251bcf47d0665cb4019b7d87cd60cb99c Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Mon, 15 Jan 2024 10:53:14 +0300 Subject: [PATCH 46/57] Downgrade LIMITED_API_VERSION, if python3 is older than 3.6. 
--- cmake/OpenCVDetectPython.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake index 839ec1148d..a23fba6e5a 100644 --- a/cmake/OpenCVDetectPython.cmake +++ b/cmake/OpenCVDetectPython.cmake @@ -295,7 +295,12 @@ find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYT OCV_OPTION(PYTHON3_LIMITED_API "Build with Python Limited API (not available with numpy >=1.15 <1.17)" NO VISIBLE_IF PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" OR NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.17") if(PYTHON3_LIMITED_API) - set(PYTHON3_LIMITED_API_VERSION "0x03060000" CACHE STRING "Minimal Python version for Limited API") + set(_default_ver "0x03060000") + if(PYTHON3_VERSION_STRING VERSION_LESS "3.6") + # fix for older pythons + set(_default_ver "0x030${PYTHON3_VERSION_MINOR}0000") + endif() + set(PYTHON3_LIMITED_API_VERSION ${_default_ver} CACHE STRING "Minimal Python version for Limited API") endif() if(PYTHON_DEFAULT_EXECUTABLE) From dabc325cac4d60824f011c4d78fa49b378cec52d Mon Sep 17 00:00:00 2001 From: Stefan Dragnev Date: Wed, 17 Jan 2024 11:21:25 +0100 Subject: [PATCH 47/57] jpeg: use libjpeg-turbo built-in color conversions --- modules/imgcodecs/src/grfmt_jpeg.cpp | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp index 506cebdf49..54dfea5a75 100644 --- a/modules/imgcodecs/src/grfmt_jpeg.cpp +++ b/modules/imgcodecs/src/grfmt_jpeg.cpp @@ -409,7 +409,9 @@ bool JpegDecoder::readData( Mat& img ) { jpeg_decompress_struct* cinfo = &((JpegState*)m_state)->cinfo; JpegErrorMgr* jerr = &((JpegState*)m_state)->jerr; +#ifndef JCS_EXTENSIONS JSAMPARRAY buffer = 0; +#endif if( setjmp( jerr->setjmp_buffer ) == 0 ) { @@ -429,6 +431,18 @@ bool JpegDecoder::readData( Mat& img ) } #endif +#ifdef JCS_EXTENSIONS + if( color ) + { + cinfo->out_color_space = JCS_EXT_BGR; + 
cinfo->out_color_components = 3; + } + else + { + cinfo->out_color_space = JCS_GRAYSCALE; + cinfo->out_color_components = 1; + } +#else if( color ) { if( cinfo->num_components != 4 ) @@ -455,6 +469,7 @@ bool JpegDecoder::readData( Mat& img ) cinfo->out_color_components = 4; } } +#endif // Check for Exif marker APP1 jpeg_saved_marker_ptr exif_marker = NULL; @@ -481,12 +496,17 @@ bool JpegDecoder::readData( Mat& img ) jpeg_start_decompress( cinfo ); +#ifndef JCS_EXTENSIONS buffer = (*cinfo->mem->alloc_sarray)((j_common_ptr)cinfo, JPOOL_IMAGE, m_width*4, 1 ); +#endif uchar* data = img.ptr(); for( ; m_height--; data += step ) { +#ifdef JCS_EXTENSIONS + jpeg_read_scanlines( cinfo, &data, 1 ); +#else jpeg_read_scanlines( cinfo, buffer, 1 ); if( color ) { @@ -502,6 +522,7 @@ bool JpegDecoder::readData( Mat& img ) else icvCvt_CMYK2Gray_8u_C4C1R( buffer[0], 0, data, 0, Size(m_width,1) ); } +#endif } result = true; @@ -593,8 +614,11 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) int width = img.cols, height = img.rows; std::vector out_buf(1 << 12); + +#ifndef JCS_EXTENSIONS AutoBuffer _buffer; uchar* buffer; +#endif struct jpeg_compress_struct cinfo; JpegErrorMgr jerr; @@ -629,8 +653,15 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) int _channels = img.channels(); int channels = _channels > 1 ? 3 : 1; + +#ifdef JCS_EXTENSIONS + cinfo.input_components = _channels; + cinfo.in_color_space = _channels == 3 ? JCS_EXT_BGR + : _channels == 4 ? JCS_EXT_BGRX : JCS_GRAYSCALE; +#else cinfo.input_components = channels; cinfo.in_color_space = channels > 1 ? 
JCS_RGB : JCS_GRAYSCALE; +#endif int quality = 95; int progressive = 0; @@ -746,14 +777,17 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) jpeg_start_compress( &cinfo, TRUE ); +#ifndef JCS_EXTENSIONS if( channels > 1 ) _buffer.allocate(width*channels); buffer = _buffer.data(); +#endif for( int y = 0; y < height; y++ ) { uchar *data = img.data + img.step*y, *ptr = data; +#ifndef JCS_EXTENSIONS if( _channels == 3 ) { icvCvt_BGR2RGB_8u_C3R( data, 0, buffer, 0, Size(width,1) ); @@ -764,6 +798,7 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) icvCvt_BGRA2BGR_8u_C4C3R( data, 0, buffer, 0, Size(width,1), 2 ); ptr = buffer; } +#endif jpeg_write_scanlines( &cinfo, &ptr, 1 ); } From a865fd6f00224767d397b726a452a575fe06b895 Mon Sep 17 00:00:00 2001 From: Alexander Lyulkov Date: Wed, 17 Jan 2024 10:12:27 +0300 Subject: [PATCH 48/57] Fixed AndroidManifest.xml syntax in Android camerapreview sample --- .../tutorial-1-camerapreview/gradle/AndroidManifest.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml index 7f543d2ec7..56c9ad32d6 100644 --- a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml +++ b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml @@ -26,13 +26,13 @@ android:largeScreens="true" android:anyDensity="true" /> - //! [camera_permissions] + - //! 
[camera_permissions] + From 8ba69562b516cae7f55a9d226db7ef0255aaeb52 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Wed, 17 Jan 2024 13:28:53 +0000 Subject: [PATCH 49/57] Ifdef OpenVINO API 1.0 usage in G-API module --- modules/gapi/src/backends/ie/giebackend.cpp | 4 ++-- modules/gapi/src/backends/ie/giebackend.hpp | 4 ++-- modules/gapi/src/backends/ie/giebackend/giewrapper.cpp | 4 ++-- modules/gapi/test/infer/gapi_infer_ie_test.cpp | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp index 935f81d275..cdb246e4a2 100644 --- a/modules/gapi/src/backends/ie/giebackend.cpp +++ b/modules/gapi/src/backends/ie/giebackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018-2023 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation #include "precomp.hpp" @@ -10,7 +10,7 @@ // (cv::gapi::ie::backend() is still there and is defined always) #include "backends/ie/giebackend.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #if INF_ENGINE_RELEASE <= 2019010000 # error G-API IE module supports only OpenVINO IE >= 2019 R1 diff --git a/modules/gapi/src/backends/ie/giebackend.hpp b/modules/gapi/src/backends/ie/giebackend.hpp index c7d938878d..98715fc2db 100644 --- a/modules/gapi/src/backends/ie/giebackend.hpp +++ b/modules/gapi/src/backends/ie/giebackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation #ifndef OPENCV_GAPI_GIEBACKEND_HPP #define OPENCV_GAPI_GIEBACKEND_HPP @@ -10,7 +10,7 @@ // Include anyway - cv::gapi::ie::backend() still needs to be defined #include "opencv2/gapi/infer/ie.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include // type_list_index #include diff --git a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp index a185e7b8ce..6df8187e16 100644 --- a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp +++ b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp @@ -2,9 +2,9 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include #include diff --git a/modules/gapi/test/infer/gapi_infer_ie_test.cpp b/modules/gapi/test/infer/gapi_infer_ie_test.cpp index 92de39abfa..8e91d576aa 100644 --- a/modules/gapi/test/infer/gapi_infer_ie_test.cpp +++ b/modules/gapi/test/infer/gapi_infer_ie_test.cpp @@ -6,7 +6,7 @@ #include "../test_precomp.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include #include From 37b02d170fc09ef0786ae101153ccbd5a43e19f5 Mon Sep 17 00:00:00 2001 From: Zhuo Zhang Date: Tue, 16 Jan 2024 19:39:30 +0800 Subject: [PATCH 50/57] fix qnx-sdp-700 build based on https://github.com/opencv/opencv/pull/24864 --- modules/core/src/system.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index c29b97f880..0a6e2f5037 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -128,7 +128,7 @@ void* 
allocSingletonNewBuffer(size_t size) { return malloc(size); } # include using Elf64_auxv_t = auxv64_t; # include -constexpr decltype(auto) AT_HWCAP = NT_GNU_HWCAP; +const uint64_t AT_HWCAP = NT_GNU_HWCAP; #else # include #endif From 224b9ee33f9c570f3b411bd669aeec36ac751508 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Wed, 10 Jan 2024 12:53:33 +0300 Subject: [PATCH 51/57] RISC-V: updated intrin_rvv071.hpp to work with modern toolchain 2.8.0 - intrinsics implementation (071) reworked to use modern RVV intrinsics syntax - cmake toolchain file (071) now allows selecting from predefined configurations Co-authored-by: Fang Sun --- .../include/opencv2/core/cv_cpu_dispatch.h | 2 +- .../opencv2/core/hal/intrin_rvv071.hpp | 1596 ++++++++++------- .../linux/riscv64-071-gcc.toolchain.cmake | 53 +- 3 files changed, 1033 insertions(+), 618 deletions(-) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index de7b84b82a..8269fa6121 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -147,7 +147,7 @@ #endif #if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071) -# include +# include # define CV_RVV071 1 #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp index 26f478feda..ef5f0d0ed9 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp @@ -19,7 +19,7 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128 1 -#define CV_SIMD128_64F 0 +#define CV_SIMD128_64F 1 //////////// Types //////////// struct v_uint8x16 { @@ -32,11 +32,11 @@ struct v_uint8x16 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) { uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = 
(vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16); + val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16); } uchar get0() const { - return vmv_x_s_u8m1_u8(val, 16); + return vmv_x_s_u8m1_u8(val); } vuint8m1_t val; @@ -53,11 +53,11 @@ struct v_int8x16 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) { schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = (vint8m1_t)vle_v_i8m1((schar*)v, 16); + val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16); } schar get0() const { - return vmv_x_s_i8m1_i8(val, 16); + return vmv_x_s_i8m1_i8(val); } vint8m1_t val; @@ -73,11 +73,11 @@ struct v_uint16x8 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) { ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8); + val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8); } ushort get0() const { - return vmv_x_s_u16m1_u16(val, 8); + return vmv_x_s_u16m1_u16(val); } vuint16m1_t val; @@ -93,11 +93,11 @@ struct v_int16x8 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8); + val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8); } short get0() const { - return vmv_x_s_i16m1_i16(val, 8); + return vmv_x_s_i16m1_i16(val); } vint16m1_t val; @@ -113,11 +113,11 @@ struct v_uint32x4 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) { unsigned v[] = {v0, v1, v2, v3}; - val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4); + val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4); } unsigned get0() const { - return vmv_x_s_u32m1_u32(val, 4); + return vmv_x_s_u32m1_u32(val); } vuint32m1_t val; @@ -133,11 +133,11 @@ struct v_int32x4 v_int32x4(int v0, int v1, int v2, int v3) { int v[] = {v0, v1, v2, v3}; - val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4); + val = (vint32m1_t)vle32_v_i32m1((signed 
int*)v, 4); } int get0() const { - return vmv_x_s_i32m1_i32(val, 4); + return vmv_x_s_i32m1_i32(val); } vint32m1_t val; }; @@ -152,11 +152,11 @@ struct v_float32x4 v_float32x4(float v0, float v1, float v2, float v3) { float v[] = {v0, v1, v2, v3}; - val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4); + val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4); } float get0() const { - return vfmv_f_s_f32m1_f32(val, 4); + return vfmv_f_s_f32m1_f32(val); } vfloat32m1_t val; }; @@ -171,11 +171,11 @@ struct v_uint64x2 v_uint64x2(uint64 v0, uint64 v1) { uint64 v[] = {v0, v1}; - val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2); + val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2); } uint64 get0() const { - return vmv_x_s_u64m1_u64(val, 2); + return vmv_x_s_u64m1_u64(val); } vuint64m1_t val; }; @@ -190,11 +190,11 @@ struct v_int64x2 v_int64x2(int64 v0, int64 v1) { int64 v[] = {v0, v1}; - val = (vint64m1_t)vle_v_i64m1((long*)v, 2); + val = (vint64m1_t)vle64_v_i64m1((long*)v, 2); } int64 get0() const { - return vmv_x_s_i64m1_i64(val, 2); + return vmv_x_s_i64m1_i64(val); } vint64m1_t val; }; @@ -209,21 +209,21 @@ struct v_float64x2 v_float64x2(double v0, double v1) { double v[] = {v0, v1}; - val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2); + val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2); } double get0() const { - return vfmv_f_s_f64m1_f64(val, 2); + return vfmv_f_s_f64m1_f64(val); } vfloat64m1_t val; }; - +/* #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \ -inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \ +inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \ -inline v_int16x8 v_reinterpret_as_s16(const 
v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \ inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \ inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \ @@ -233,17 +233,128 @@ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2( OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8) -OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8) +OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8) OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16) -OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16) +OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16) OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32) -OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32) +OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32) OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64) -OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64) +OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64) OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64) OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32) +*/ +inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return 
v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return 
v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const 
v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return 
v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return 
v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); } +inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return 
v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return 
v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); } +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); } + #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ -inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \ +inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \ inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) @@ -254,7 +365,7 @@ OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) -inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } +inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } @@ -297,8 +408,8 @@ OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 
2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) @@ -401,10 +512,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -412,9 +523,9 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, 
v.val, 1); return v_float32x4(res); } @@ -471,11 +582,11 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ - return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \ } \ inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ { \ - a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \ + a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \ return a; \ } @@ -485,17 +596,17 @@ OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) inline v_float32x4 operator ~ (const v_float32x4& a) { - return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4))); } #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ - return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \ } \ inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ { \ - a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \ + a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \ return a; \ } @@ -505,7 +616,7 @@ OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) inline v_float64x2 operator ~ (const v_float64x2& a) { - return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); + return 
v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2))); } inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { @@ -527,19 +638,19 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) inline v_uint32x4 v_abs(v_int32x4 x) { vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); - return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); + return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4))); } inline v_uint16x8 v_abs(v_int16x8 x) { vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); - return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); + return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8))); } inline v_uint8x16 v_abs(v_int8x16 x) { vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); - return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); + return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16))); } inline v_float32x4 v_abs(v_float32x4 x) @@ -591,7 +702,7 @@ inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \ vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\ vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\ - return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \ + return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \ } OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16) @@ -604,8 +715,8 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, { vint16m2_t res = vundefined_i16m2(); res = vwmul_vv_i16m2(a.val, b.val, 16); - c.val = vget_i16m2_i16m1(res, 0); - d.val = vget_i16m2_i16m1(res, 1); + c.val = vget_v_i16m2_i16m1(res, 0); + d.val = vget_v_i16m2_i16m1(res, 1); } inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, @@ -613,8 +724,8 @@ inline void v_mul_expand(const v_uint8x16& a, const 
v_uint8x16& b, { vuint16m2_t res = vundefined_u16m2(); res = vwmulu_vv_u16m2(a.val, b.val, 16); - c.val = vget_u16m2_u16m1(res, 0); - d.val = vget_u16m2_u16m1(res, 1); + c.val = vget_v_u16m2_u16m1(res, 0); + d.val = vget_v_u16m2_u16m1(res, 1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -622,8 +733,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, { vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - c.val = vget_i32m2_i32m1(res, 0); - d.val = vget_i32m2_i32m1(res, 1); + c.val = vget_v_i32m2_i32m1(res, 0); + d.val = vget_v_i32m2_i32m1(res, 1); } inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, @@ -631,8 +742,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, { vuint32m2_t res = vundefined_u32m2(); res = vwmulu_vv_u32m2(a.val, b.val, 8); - c.val = vget_u32m2_u32m1(res, 0); - d.val = vget_u32m2_u32m1(res, 1); + c.val = vget_v_u32m2_u32m1(res, 0); + d.val = vget_v_u32m2_u32m1(res, 1); } inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, @@ -640,8 +751,8 @@ inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, { vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - c.val = vget_i64m2_i64m1(res, 0); - d.val = vget_i64m2_i64m1(res, 1); + c.val = vget_v_i64m2_i64m1(res, 0); + d.val = vget_v_i64m2_i64m1(res, 1); } inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, @@ -649,8 +760,8 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, { vuint64m2_t res = vundefined_u64m2(); res = vwmulu_vv_u64m2(a.val, b.val, 4); - c.val = vget_u64m2_u64m1(res, 0); - d.val = vget_u64m2_u64m1(res, 1); + c.val = vget_v_u64m2_u64m1(res, 0); + d.val = vget_v_u64m2_u64m1(res, 1); } OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) @@ -669,118 +780,202 @@ OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) // 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& 
a, const v_int16x8& b) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4)); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2)); + res = 
vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2)); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return 
v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, 
(vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + 
vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand(const 
v_int16x8& a, const v_int16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v1 = vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v1 
= vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } //////// Fast Dot Product //////// @@ -789,14 +984,14 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4)); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4)); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4)); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4)); } // 32 >> 64 @@ -804,13 +999,13 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2)); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2)); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 8); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4)); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 4), c.val, 4)); } // 8 >> 32 @@ -819,8 +1014,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = 
vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -828,8 +1023,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) @@ -837,16 +1032,16 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 
1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } // 16 >> 64 @@ -855,16 +1050,16 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2)); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) @@ -872,16 +1067,16 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v2 = 
vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } @@ -890,16 +1085,16 @@ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \ val = intrin(val, a.val, val, num); \ - return vmv_x_s_##len##m1_##len(val, num); \ + return vmv_x_s_##len##m1_##len(val); \ } -#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \ +#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ - v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \ + v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \ val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \ - return val[0]; \ + return scalerfunc(val); \ } OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8) @@ -910,30 +1105,30 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu inline float v_reduce_sum(const v_float32x4& a) \ {\ vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \ - val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \ - return 
vfmv_f_s_f32m1_f32(val, 4); \ + val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \ + return vfmv_f_s_f32m1_f32(val); \ } inline double v_reduce_sum(const v_float64x2& a) \ {\ vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \ - val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \ - return vfmv_f_s_f64m1_f64(val, 2); \ + val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \ + return vfmv_f_s_f64m1_f64(val); \ } inline uint64 v_reduce_sum(const v_uint64x2& a) -{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); } +{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); } inline int64 v_reduce_sum(const v_int64x2& a) -{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); } +{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4) +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, 
vmv_x_s_i64m1_i64) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min) @@ -944,11 +1139,15 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4); - b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4); - c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4); - d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4); - return v_float32x4(a0[0], b0[0], c0[0], d0[0]); + a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4); + b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4); + c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4); + d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4); + vfloat32m1_t res; + res = vslideup_vx_f32m1(a0, b0, 1, 4); + res = vslideup_vx_f32m1(res, c0, 2, 4); + res = vslideup_vx_f32m1(res, d0, 3, 4); + return v_float32x4(res); } inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) @@ -957,8 +1156,8 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4); vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4); vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4); - return a0[0]; + a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4); + return vfmv_f_s_f32m1_f32(a0); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \ @@ -1020,43 +1219,43 @@ inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = 
vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); -} + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); +}/**/ inline v_float32x4 v_not_nan(const v_float32x4& a) { - vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4); + vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + 
return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } //TODO: == @@ -1064,43 +1263,43 @@ inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); -} + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); +}/**/ inline v_float64x2 v_not_nan(const v_float64x2& a) { - vbool64_t mask = 
vmford_vv_f64m1_b64(a.val, a.val, 2); + vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ @@ -1108,16 +1307,23 @@ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \ v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \ { \ + vuint32m4_t vindex = vundefined_u32m4(); \ + vuint32m1_t vindex0 = vid_v_u32m1(4); \ + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \ + vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \ + vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \ v##_Tp##32m4_t val = vundefined_##_T##m4(); \ - val = vset_##_T##m4(val, 0, a0.val); \ - val = vset_##_T##m4(val, 1, a1.val); \ - val = vset_##_T##m4(val, 2, a2.val); \ - val = vset_##_T##m4(val, 3, a3.val); \ - val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \ - b0.val = vget_##_T##m4_##_T##m1(val, 0); \ - b1.val = vget_##_T##m4_##_T##m1(val, 1); \ - b2.val = vget_##_T##m4_##_T##m1(val, 2); \ - b3.val = vget_##_T##m4_##_T##m1(val, 3); \ + val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \ + val = vrgather_vv_##_T##m4(val, vindex, 16); \ + b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \ + b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \ + b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \ + b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \ } OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32) 
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32) @@ -1167,25 +1373,28 @@ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ } \ template inline _Tpvec v_rotate_right(const _Tpvec& a) \ { \ - return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\ + suffix##m1_t res = vundefined_##_T##m1(); \ + return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, a.val); \ - tmp = vset_##_T##m2(tmp, 1, b.val); \ - tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \ + res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\ + return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\ } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, b.val); \ - tmp = vset_##_T##m2(tmp, 1, a.val); \ - tmp = vslideup_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \ + res = vslideup_vx_##_T##m2(res, tmp, n, num2);\ + return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1203,50 +1412,132 @@ OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64) -#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \ +#if 1 +#define 
vreinterpret_v_i8m1_i8m1 +#define vreinterpret_v_u8m1_u8m1 +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ - typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \ - vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\ - return _Tpvec(_Tp2##_t(tmp)); } \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \ + res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ inline _Tpvec v_load_low(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, hnum)); }\ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\ inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, num)); } \ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \ inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, hnum);}\ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \ - vse_v_##len(ptr, a0, hnum);}\ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\ inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void 
v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ -{ vse_v_##len(ptr, a.val, num); } +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned 
long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar) +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \ + res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } 
\ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) + +#else + +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res, res1; \ + res = vle##elemsize##_v_##len(ptr0, hnum); \ + res1 = vle##elemsize##_v_##len(ptr1, hnum); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0; \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse##elemsize##_v_##len(ptr, a0, hnum);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } + 
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) + +#endif ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) { -#if 1 +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[ 0]], @@ -1266,16 +1557,18 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx) tab[idx[14]], tab[idx[15]] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); #else - int32xm4_t index32 = vlev_int32xm4(idx, 16); - vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16); - vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16); - return v_int8x16(vlxbv_i8m1(tab, index, 16)); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16)); +#endif #endif } inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1295,10 +1588,24 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ tab[idx[7]], tab[idx[7] + 1] }; - return 
v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 1, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) { +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1318,7 +1625,23 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) tab[idx[3] + 2], tab[idx[3] + 3] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 2, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0)); + index = vadd_vv_u32m4(seq, vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } @@ -1327,6 +1650,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint inline v_int16x8 v_lut(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1338,10 +1662,18 @@ inline v_int16x8 v_lut(const short* tab, const int* idx) tab[idx[6]], tab[idx[7]] }; - return 
v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8)); +#endif +#endif } inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1353,10 +1685,24 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) tab[idx[3]], tab[idx[3] + 1] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 1, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); +#endif +#endif } inline v_int16x8 v_lut_quads(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1368,7 +1714,21 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx) tab[idx[1] + 2], tab[idx[1] + 3] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 2, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0)); + index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); 
+#endif +#endif } inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } @@ -1376,6 +1736,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein inline v_int32x4 v_lut(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1383,10 +1744,14 @@ inline v_int32x4 v_lut(const int* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1394,11 +1759,20 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) tab[idx[1]], tab[idx[1] + 1] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_int32x4(vloxei32_v_i32m1(tab, index, 4)); +#endif } inline v_int32x4 v_lut_quads(const int* tab, const int* idx) { - return v_int32x4(vle_v_i32m1(tab+idx[0], 4)); + return v_int32x4(vle32_v_i32m1(tab+idx[0], 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } @@ -1406,26 +1780,27 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re inline v_int64x2 
v_lut(const int64_t* tab, const int* idx) { - vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_int64x2(res); + //vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) { - return v_int64x2(vle_v_i64m1(tab+idx[0], 2)); + return v_int64x2(vle64_v_i64m1(tab+idx[0], 2)); } inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { - vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_uint64x2(res); + //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { - return v_uint64x2(vle_v_u64m1(tab+idx[0], 2)); + return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2)); } inline v_float32x4 v_lut(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1433,10 +1808,14 @@ inline v_float32x4 v_lut(const float* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1444,69 +1823,79 @@ inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) tab[idx[1]], tab[idx[1]+1] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = 
vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_float32x4(vloxei32_v_f32m1(tab, index, 4)); +#endif } inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { - return v_float32x4(vle_v_f32m1(tab + idx[0], 4)); + return v_float32x4(vle32_v_f32m1(tab + idx[0], 4)); } inline v_float64x2 v_lut(const double* tab, const int* idx) { - vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { - return v_float64x2(vle_v_f64m1(tab+idx[0], 2)); + return v_float64x2(vle64_v_f64m1(tab+idx[0], 2)); } inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { - int CV_DECL_ALIGNED(32) elems[4] = + /*int CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_int32x4(vle_v_i32m1(elems, 4)); + };*/ + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) { - unsigned CV_DECL_ALIGNED(32) elems[4] = + /*unsigned CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_uint32x4(vle_v_u32m1(elems, 4)); + };*/ + return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { - float CV_DECL_ALIGNED(32) elems[4] = + /*float CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_float32x4(vle_v_f32m1(elems, 4)); + };*/ + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 
2, 4), 4)); } inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) { - vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2)); } inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) { - vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4); - vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); + vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4); + //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); - x.val = vlxe_v_f32m1(tab, index_x, 4); - y.val = vlxe_v_f32m1(tab, index_y, 4); + //x.val = vlxe_v_f32m1(tab, index_x, 4); + //y.val = vlxe_v_f32m1(tab, index_y, 4); + vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4); } inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) @@ -1518,52 +1907,52 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } -#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \ +#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \ inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \ }\ template inline \ v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, 
a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \ }\ inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ asm("" ::: "memory"); \ - vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ + vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ }\ template inline \ void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ - vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ } -OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char) -OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short) -OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int) +OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed 
short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32) // pack boolean inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) { vuint16m2_t tmp = vundefined_u16m2(); \ - tmp = vset_u16m2(tmp, 0, a.val); \ - tmp = vset_u16m2(tmp, 1, b.val); \ - return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16)); + tmp = vset_v_u16m1_u16m2(tmp, 0, a.val); \ + tmp = vset_v_u16m1_u16m2(tmp, 1, b.val); \ + return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, @@ -1571,12 +1960,12 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, { vuint32m4_t vabcd = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - vabcd = vset_u32m4(vabcd, 0, a.val); \ - vabcd = vset_u32m4(vabcd, 1, b.val); \ - vabcd = vset_u32m4(vabcd, 2, c.val); \ - vabcd = vset_u32m4(vabcd, 3, d.val); \ - v16 = vnsrl_vx_u16m2(vabcd, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val); \ + v16 = vnsrl_wx_u16m2(vabcd, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, @@ -1586,17 +1975,17 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin vuint64m8_t v64 = vundefined_u64m8(); \ vuint32m4_t v32 = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - v64 = vset_u64m8(v64, 0, a.val); \ - v64 = vset_u64m8(v64, 1, b.val); \ - v64 = vset_u64m8(v64, 2, 
c.val); \ - v64 = vset_u64m8(v64, 3, d.val); \ - v64 = vset_u64m8(v64, 4, e.val); \ - v64 = vset_u64m8(v64, 5, f.val); \ - v64 = vset_u64m8(v64, 6, g.val); \ - v64 = vset_u64m8(v64, 7, h.val); \ - v32 = vnsrl_vx_u32m4(v64, 0, 16); - v16 = vnsrl_vx_u16m2(v32, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + v64 = vset_v_u64m1_u64m8(v64, 0, a.val); \ + v64 = vset_v_u64m1_u64m8(v64, 1, b.val); \ + v64 = vset_v_u64m1_u64m8(v64, 2, c.val); \ + v64 = vset_v_u64m1_u64m8(v64, 3, d.val); \ + v64 = vset_v_u64m1_u64m8(v64, 4, e.val); \ + v64 = vset_v_u64m1_u64m8(v64, 5, f.val); \ + v64 = vset_v_u64m1_u64m8(v64, 6, g.val); \ + v64 = vset_v_u64m1_u64m8(v64, 7, h.val); \ + v32 = vnsrl_wx_u32m4(v64, 0, 16); + v16 = vnsrl_wx_u16m2(v32, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \ @@ -1612,35 +2001,35 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \ } \ inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \ + return 
vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \ } \ template inline \ v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \ } \ template inline \ void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \ - return vse_v_u##tp1##m1(ptr, val, num2);\ + vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \ + return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\ } OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char ) OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) @@ -1690,8 +2079,12 @@ static const signed char popCountTable[256] = }; inline vuint8m1_t vcnt_u8(vuint8m1_t val){ - vuint8m1_t v0 = val & 1; - return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0; +#if __riscv_v == 7000 + vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16); + return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16); +#else + return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16); +#endif } inline v_uint8x16 @@ -1703,156 +2096,138 @@ v_popcount(const v_uint8x16& a) 
inline v_uint8x16 v_popcount(const v_int8x16& a) { - return v_uint8x16(vcnt_u8((vuint8m1_t)a.val)); + return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val))); } inline v_uint16x8 v_popcount(const v_uint16x8& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint16x8 v_popcount(const v_int16x8& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), 
vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint32x4 v_popcount(const v_int32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 
= zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val)); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(); + vuint16m1_t res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } inline v_uint64x2 v_popcount(const v_int64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val))); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } #define SMASK 1, 2, 4, 8, 16, 32, 64, 128 inline int v_signmask(const v_uint8x16& a) 
{ + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16); - vuint32m1_t res = vmv_v_x_u32m1(0, 4); - vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8); - res = vredsum_vs_u32m2_u32m1(res, t2, res, 8); - res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8); - return vmv_x_s_u32m1_u32(res, 8); + vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int8x16& a) { - vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16); - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8); - res = vredsum_vs_i32m2_i32m1(res, t2, res, 8); - res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8); - return vmv_x_s_i32m1_i32(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); + vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vbool16_t mask 
= vmslt_vx_i16m1_b16(a.val, 0, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_uint16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8); + vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4); + vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4); + res 
= vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint64x2& a) { - vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2); - int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1); - return res; + vuint64m1_t res = vundefined_u64m1(); + vuint64m1_t id = vid_v_u64m1(2); + vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2); + vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2); + vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2); + res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2); + return vmv_x_s_u64m1_u64(res); } inline int v_signmask(const v_int64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } @@ -1860,12 +2235,14 @@ inline int v_signmask(const v_float64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } inline int v_signmask(const v_float32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + return v_signmask(v_reinterpret_as_u32(a)); + /* + vuint32m1_t res; + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res);*/ } inline int v_scan_forward(const v_int8x16& a) { @@ -1905,24 +2282,22 @@ int val = v_signmask(a); if(val==0) return 0; else return trailingZeros32(val); } -#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \ +#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \ inline bool v_check_all(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - 
return (v1[0] | v1[1] | v1[2] | v1[3]) == 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \ } \ inline bool v_check_any(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - return (v1[0] | v1[1] | v1[2] | v1[3]) != 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \ } -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64) inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } @@ -1950,92 +2325,93 @@ inline bool v_check_any(const v_int64x2& a) inline bool v_check_any(const v_float64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } -#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \ +#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ { \ - return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \ + return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \ } -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, 
u32m1, vbool32_t, 4) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32) inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) { - return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); + return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4)); } inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) { - return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); + return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2)); } -#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \ +#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \ inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \ - b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \ + b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \ } \ inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, 
num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } \ inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \ } \ inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \ { \ - _T2##_t val = vle##_v_##_Tp1(ptr, num2); \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, 
int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { vuint16m2_t b = vundefined_u16m2(); vuint32m2_t c = vundefined_u32m2(); - vuint8m1_t val = vle_v_u8m1(ptr, 4); \ + vuint8m1_t val = vle8_v_u8m1(ptr, 4); \ b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \ - c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ - return v_uint32x4(vget_u32m2_u32m1(c, 0)); + c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ + return v_uint32x4(vget_v_u32m2_u32m1(c, 0)); } inline v_int32x4 v_load_expand_q(const schar* ptr) { vint16m2_t b = vundefined_i16m2(); vint32m2_t c = vundefined_i32m2(); - vint8m1_t val = vle_v_i8m1(ptr, 4); \ + vint8m1_t val = vle8_v_i8m1(ptr, 4); \ b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \ - c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ - return v_int32x4(vget_i32m2_i32m1(c, 0)); + c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ + return v_int32x4(vget_v_i32m2_i32m1(c, 0)); } -#define VITL_16 (vuint32m2_t){0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} -#define VITL_8 (vuint32m2_t){0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} -#define VITL_4 (vuint32m2_t){0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} -#define VITL_2 (vuint32m2_t){0, 0, 2, 0, 1, 0, 3, 0} +#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} +#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} +#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} +#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0} -#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \ +#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, 
_T, _UTp, _UT, num, num2, len, numh, refunc) \ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ { \ v##_Tp##m2_t tmp = vundefined_##_T##m2();\ - tmp = vset_##_T##m2(tmp, 0, a0.val); \ - tmp = vset_##_T##m2(tmp, 1, a1.val); \ - vuint32m2_t mask = VITL_##num; \ - tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \ - b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \ - b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \ + unsigned mdata[] = VITL_##num; \ + vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \ + tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \ + b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \ + b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \ } \ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ @@ -2044,58 +2420,59 @@ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ } \ inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ - v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t b1 = vundefined_##_T##m1(); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ + b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ return v_##_Tpvec(b1);\ } \ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ { \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ - v##_Tp##m1_t b0 = 
vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ } -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2) inline v_uint8x16 v_reverse(const v_uint8x16 &a) { - vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16)); + return v_uint8x16(vrgather_vv_u8m1(a.val, 
vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_int8x16 v_reverse(const v_int8x16 &a) { - vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16)); + return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_uint16x8 v_reverse(const v_uint16x8 &a) { - vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8)); + return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_int16x8 v_reverse(const v_int16x8 &a) { - vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8)); + return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_uint32x4 v_reverse(const v_uint32x4 &a) { - return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_int32x4 v_reverse(const v_int32x4 &a) { - return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_float32x4 v_reverse(const v_float32x4 &a) @@ -2103,17 +2480,17 @@ inline v_float32x4 v_reverse(const v_float32x4 &a) inline v_uint64x2 v_reverse(const v_uint64x2 &a) { - return v_uint64x2(a.val[1], a.val[0]); + return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_int64x2 v_reverse(const v_int64x2 &a) { - return v_int64x2(a.val[1], a.val[0]); + return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_float64x2 v_reverse(const v_float64x2 &a) { - return v_float64x2(a.val[1], a.val[0]); + return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } 
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \ @@ -2132,19 +2509,19 @@ OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2) OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3) -#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \ -template inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; } +#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \ +template inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); } -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, 
vfmv_f_s_f32m1_f32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64) #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \ template inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); } @@ -2158,10 +2535,24 @@ OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) + +inline void __builtin_riscv_fsrm(int val) +{ + asm("csrw frm, %0\n\t" + : + :"r"(val)); + return; +} + +inline void barrier1(void *arg) { + __asm__ __volatile__("" : : "r" (arg) : "memory"); +} + inline v_int32x4 v_round(const v_float32x4& a) { __builtin_riscv_fsrm(0); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2170,7 +2561,8 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_floor(const v_float32x4& a) { __builtin_riscv_fsrm(2); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2180,7 +2572,8 @@ inline v_int32x4 v_floor(const v_float32x4& a) inline v_int32x4 v_ceil(const v_float32x4& a) { __builtin_riscv_fsrm(3); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = 
vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2190,7 +2583,8 @@ inline v_int32x4 v_ceil(const v_float32x4& a) inline v_int32x4 v_trunc(const v_float32x4& a) { __builtin_riscv_fsrm(1); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2201,10 +2595,11 @@ inline v_int32x4 v_round(const v_float64x2& a) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); //_val = vset_f64m2(_val, 1, a.val); - _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); - vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2212,9 +2607,10 @@ inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2222,10 +2618,10 @@ inline v_int32x4 v_floor(const v_float64x2& a) { __builtin_riscv_fsrm(2); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = 
vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2236,10 +2632,10 @@ inline v_int32x4 v_ceil(const v_float64x2& a) { __builtin_riscv_fsrm(3); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2250,139 +2646,86 @@ inline v_int32x4 v_trunc(const v_float64x2& a) { __builtin_riscv_fsrm(1); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } -#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ { \ - v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ + 
intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ { \ - v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ + intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \ }\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ { \ - v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ + intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \ } \ -#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - intrin##2e_v_##_T##m1x2(ptr, ret, num); \ + intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - intrin##3e_v_##_T##m1x3(ptr, ret, num); \ + intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \ } 
\ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - intrin##4e_v_##_T##m1x4(ptr, ret, num); \ + intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \ } -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \ -OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \ -OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T) +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize) //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, ) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, 
u32, 32) -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ -{ \ - v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ -} \ +{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ -{ \ - v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ -}\ +{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ -{ \ - v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ -} \ +{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - vsseg2e_v_##_T##m1x2(ptr, ret, num); \ -} \ +{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = 
vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - vsseg3e_v_##_T##m1x3(ptr, ret, num); \ -} \ +{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ -{ \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - vsseg4e_v_##_T##m1x4(ptr, ret, num); \ -} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64) +{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64) + +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { @@ -2393,17 +2736,17 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); return v_float32x4(aval); } inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4); + _val = 
vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4); return v_float32x4(aval); } @@ -2411,26 +2754,26 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_int64x2& a) @@ -2441,8 +2784,9 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a) #endif inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; - return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { @@ -2451,8 +2795,9 @@ inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) inline v_int8x16 v_interleave_quads(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; - return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0703060205010400, 
0x0F0B0E0A0D090C08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { @@ -2461,35 +2806,40 @@ inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } inline v_int16x8 v_interleave_quads(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) { - vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; - return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return 
v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_int8x16 v_pack_triplets(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16))); } inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } inline v_int16x8 v_pack_triplets(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } @@ -2506,7 +2856,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); - vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), 
vget_i64m2_i64m1(v1, 1), 2), 2); + vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2); return v_float64x2(res); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) @@ -2514,21 +2864,37 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, return res + c; } #endif ////// FP16 support /////// +#if __riscv_v == 7000 inline v_float32x4 v_load_expand(const float16_t* ptr) { - vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4); + vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4); vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4); - return v_float32x4(vget_f32m2_f32m1(v32, 0)); + return v_float32x4(vget_v_f32m2_f32m1(v32, 0)); } inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { vfloat32m2_t v32 = vundefined_f32m2(); - v32 = vset_f32m2(v32, 0, v.val); - vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4); - vse_v_f16m1((__fp16*)ptr, hv, 4); + v32 = vset_v_f32m1_f32m2(v32, 0, v.val); + vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4); + vse16_v_f16m1((__fp16*)ptr, hv, 4); +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4); + vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4); + return v_float32x4(v32); } +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + //vfloat32m2_t v32 = vundefined_f32m2(); + //v32 = vset_f32m2(v32, 0, v.val); + vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4); + vse16_v_f16mf2((__fp16*)ptr, hv, 4); +} +#endif inline void v_cleanup() {} @@ -2536,5 +2902,5 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
@endcond -} // namespace cv +} #endif diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake index 53e4a7fced..0542006570 100644 --- a/platforms/linux/riscv64-071-gcc.toolchain.cmake +++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake @@ -4,5 +4,54 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64) set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) -set(CMAKE_CXX_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071") -set(CMAKE_C_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071") +# MangoPi MQ Pro - C906FD, C906FDV +# Lichee Pi 4A - C910, C910V (?) +# CanMV K230 - C908, C908V + +# See https://github.com/T-head-Semi/gcc/blob/xuantie-gcc-10.4.0/gcc/config/riscv/riscv-cores.def + +set(_enable_vector OFF) +if(CORE STREQUAL "C906FD") + set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") +elseif(CORE STREQUAL "C906FDV") + set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + # Disabled due to limited 64-bit SEW support + # set(_enable_vector ON) +elseif(CORE STREQUAL "C908") + set(CMAKE_C_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908") +elseif(CORE STREQUAL "C908V") + set(CMAKE_C_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908") + set(_enable_vector ON) # RVV 1.0 +elseif(CORE STREQUAL "C910") + set(CMAKE_C_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910") +elseif(CORE STREQUAL "C910V") + set(CMAKE_C_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d") + set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d") + set(_enable_vector ON) # RVV 0.7.1 +elseif(CORE STREQUAL "C920") + 
set(CMAKE_C_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920") + set(_enable_vector ON) # RVV 0.7.1 +elseif(CORE STREQUAL "C920V2") + set(CMAKE_C_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2") + set(_enable_vector ON) # RVV 1.0 +else() + set(CMAKE_C_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d") + set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d") +endif() + +if(_enable_vector) + set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128") + set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128") +endif() + +if(ENABLE_GCOV) + set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fprofile-arcs -ftest-coverage") + set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -fprofile-arcs -ftest-coverage") +endif() From 6b77f50269f41e1d12b0c2efcd9710779b5ed330 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Tue, 16 Jan 2024 15:11:07 +0300 Subject: [PATCH 52/57] RISC-V: use non-saturating 64-bit add in intrin_rvv071.hpp --- modules/core/include/opencv2/core/hal/intrin_rvv071.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp index ef5f0d0ed9..e34dbc01b4 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp @@ -408,8 +408,8 @@ OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, 
vssub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) From bfad61f43344f5d84bef86617bd4c1bacec0fe94 Mon Sep 17 00:00:00 2001 From: alexlyulkov Date: Thu, 18 Jan 2024 01:35:35 +0700 Subject: [PATCH 53/57] Merge pull request #24869 from alexlyulkov:al/android-camera-view-rotate Added screen rotation support to JavaCamera2View amd NativeCameraView. Fixed JavaCamera2View initialization. #24869 Added automatic image rotation to JavaCamera2View and NativeCameraView so the video preview was matched with screen orientation. Fixed double preview initialization bug in JavaCamera2View. Added proper cameraID parsing to NativeCameraView similar to JavaCameraView ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- .../org/opencv/android/JavaCamera2View.java | 47 ++++++---- .../org/opencv/android/NativeCameraView.java | 66 +++++++++++--- .../opencv/android/CameraBridgeViewBase.java | 88 +++++++++++++++++++ .../org/opencv/android/JavaCameraView.java | 88 ++++--------------- 4 files changed, 188 insertions(+), 101 deletions(-) diff --git a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java index c899389e25..6447f07b82 100644 --- a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java +++ b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java @@ -46,6 +46,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { protected ImageReader mImageReader; protected int mPreviewFormat = ImageFormat.YUV_420_888; protected int mRequestTemplate = CameraDevice.TEMPLATE_PREVIEW; + private int mFrameRotation; protected CameraDevice mCameraDevice; protected CameraCaptureSession mCaptureSession; @@ -86,8 +87,8 @@ public class JavaCamera2View extends CameraBridgeViewBase { } } - protected boolean initializeCamera() { - Log.i(LOGTAG, "initializeCamera"); + protected boolean selectCamera() { + Log.i(LOGTAG, "selectCamera"); CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE); try { String camList[] = manager.getCameraIdList(); @@ -110,14 +111,10 @@ public class JavaCamera2View extends CameraBridgeViewBase { } } } - if (mCameraID != null) { - Log.i(LOGTAG, "Opening camera: " + mCameraID); - manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); - } else { // make JavaCamera2View behaves in the same way as JavaCameraView - Log.i(LOGTAG, "Trying to open camera with the value (" + mCameraIndex + ")"); + if (mCameraID == null) { // make JavaCamera2View behaves in the same way as JavaCameraView + 
Log.i(LOGTAG, "Selecting camera by index (" + mCameraIndex + ")"); if (mCameraIndex < camList.length) { mCameraID = camList[mCameraIndex]; - manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); } else { // CAMERA_DISCONNECTED is used when the camera id is no longer valid throw new CameraAccessException(CameraAccessException.CAMERA_DISCONNECTED); @@ -125,11 +122,11 @@ public class JavaCamera2View extends CameraBridgeViewBase { } return true; } catch (CameraAccessException e) { - Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e); + Log.e(LOGTAG, "selectCamera - Camera Access Exception", e); } catch (IllegalArgumentException e) { - Log.e(LOGTAG, "OpenCamera - Illegal Argument Exception", e); + Log.e(LOGTAG, "selectCamera - Illegal Argument Exception", e); } catch (SecurityException e) { - Log.e(LOGTAG, "OpenCamera - Security Exception", e); + Log.e(LOGTAG, "selectCamera - Security Exception", e); } return false; } @@ -204,6 +201,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { mImageReader.setOnImageAvailableListener(new ImageReader.OnImageAvailableListener() { @Override public void onImageAvailable(ImageReader reader) { + Image image = reader.acquireLatestImage(); if (image == null) return; @@ -213,8 +211,9 @@ public class JavaCamera2View extends CameraBridgeViewBase { assert (planes.length == 3); assert (image.getFormat() == mPreviewFormat); - JavaCamera2Frame tempFrame = new JavaCamera2Frame(image); + RotatedCameraFrame tempFrame = new RotatedCameraFrame(new JavaCamera2Frame(image), mFrameRotation); deliverAndDrawFrame(tempFrame); + tempFrame.mFrame.release(); tempFrame.release(); image.close(); } @@ -303,11 +302,22 @@ public class JavaCamera2View extends CameraBridgeViewBase { protected boolean connectCamera(int width, int height) { Log.i(LOGTAG, "setCameraPreviewSize(" + width + "x" + height + ")"); startBackgroundThread(); - initializeCamera(); + selectCamera(); try { + CameraManager manager = (CameraManager) 
getContext().getSystemService(Context.CAMERA_SERVICE); + CameraCharacteristics characteristics = manager.getCameraCharacteristics(mCameraID); + mFrameRotation = getFrameRotation( + characteristics.get(CameraCharacteristics.LENS_FACING) == CameraCharacteristics.LENS_FACING_FRONT, + characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION)); + boolean needReconfig = calcPreviewSize(width, height); - mFrameWidth = mPreviewSize.getWidth(); - mFrameHeight = mPreviewSize.getHeight(); + if (mFrameRotation % 180 == 0) { + mFrameWidth = mPreviewSize.getWidth(); + mFrameHeight = mPreviewSize.getHeight(); + } else { + mFrameWidth = mPreviewSize.getHeight(); + mFrameHeight = mPreviewSize.getWidth(); + } if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT)) mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth); @@ -322,12 +332,16 @@ public class JavaCamera2View extends CameraBridgeViewBase { mCaptureSession.close(); mCaptureSession = null; } - createCameraPreviewSession(); } if (mFpsMeter != null) { mFpsMeter.setResolution(mFrameWidth, mFrameHeight); } + + Log.i(LOGTAG, "Opening camera: " + mCameraID); + manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); + } catch (CameraAccessException e) { + Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e); } catch (RuntimeException e) { throw new RuntimeException("Interrupted while setCameraPreviewSize.", e); } @@ -442,6 +456,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { mGray = new Mat(); } + @Override public void release() { mRgba.release(); mGray.release(); diff --git a/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java index 44ed8c4114..b28c2121cd 100644 --- a/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java +++ 
b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java @@ -10,6 +10,7 @@ import org.opencv.videoio.VideoCapture; import org.opencv.videoio.VideoWriter; import android.content.Context; +import android.hardware.Camera; import android.util.AttributeSet; import android.util.Log; import android.view.ViewGroup.LayoutParams; @@ -25,7 +26,7 @@ public class NativeCameraView extends CameraBridgeViewBase { private Thread mThread; protected VideoCapture mCamera; - protected NativeCameraFrame mFrame; + protected RotatedCameraFrame mFrame; public NativeCameraView(Context context, int cameraId) { super(context, cameraId); @@ -89,28 +90,65 @@ public class NativeCameraView extends CameraBridgeViewBase { private boolean initializeCamera(int width, int height) { synchronized (this) { - - if (mCameraIndex == -1) { + Camera.CameraInfo cameraInfo = new Camera.CameraInfo(); + int localCameraIndex = mCameraIndex; + if (mCameraIndex == CAMERA_ID_ANY) { Log.d(TAG, "Try to open default camera"); - mCamera = new VideoCapture(0, Videoio.CAP_ANDROID); - } else { - Log.d(TAG, "Try to open camera with index " + mCameraIndex); - mCamera = new VideoCapture(mCameraIndex, Videoio.CAP_ANDROID); + localCameraIndex = 0; + } else if (mCameraIndex == CAMERA_ID_BACK) { + Log.i(TAG, "Trying to open back camera"); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) { + localCameraIndex = camIdx; + break; + } + } + } else if (mCameraIndex == CAMERA_ID_FRONT) { + Log.i(TAG, "Trying to open front camera"); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) { + localCameraIndex = camIdx; + break; + } + } } + if (localCameraIndex == CAMERA_ID_BACK) { + Log.e(TAG, "Back camera not found!"); + return false; + } else if 
(localCameraIndex == CAMERA_ID_FRONT) { + Log.e(TAG, "Front camera not found!"); + return false; + } + + Log.d(TAG, "Try to open camera with index " + localCameraIndex); + mCamera = new VideoCapture(localCameraIndex, Videoio.CAP_ANDROID); + if (mCamera == null) return false; - if (mCamera.isOpened() == false) return false; - mFrame = new NativeCameraFrame(mCamera); + if (mCameraIndex != CAMERA_ID_BACK && mCameraIndex != CAMERA_ID_FRONT) + Camera.getCameraInfo(localCameraIndex, cameraInfo); + int frameRotation = getFrameRotation( + cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT, + cameraInfo.orientation); + + mFrame = new RotatedCameraFrame(new NativeCameraFrame(mCamera), frameRotation); mCamera.set(Videoio.CAP_PROP_FRAME_WIDTH, width); mCamera.set(Videoio.CAP_PROP_FRAME_HEIGHT, height); - mFrameWidth = (int)mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); - mFrameHeight = (int)mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + if (frameRotation % 180 == 0) { + mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); + mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + } else { + mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); + } if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT)) mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth); @@ -131,7 +169,10 @@ public class NativeCameraView extends CameraBridgeViewBase { private void releaseCamera() { synchronized (this) { - if (mFrame != null) mFrame.release(); + if (mFrame != null) { + mFrame.mFrame.release(); + mFrame.release(); + } if (mCamera != null) mCamera.release(); } } @@ -162,6 +203,7 @@ public class NativeCameraView extends CameraBridgeViewBase { mBgr = new Mat(); } + @Override public void release() { if (mGray != null) mGray.release(); if (mRgba != null) mRgba.release(); diff --git 
a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java index 1993cf1407..4aa6a350f8 100644 --- a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java +++ b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java @@ -4,6 +4,7 @@ import java.util.List; import org.opencv.BuildConfig; import org.opencv.R; +import org.opencv.core.Core; import org.opencv.core.Mat; import org.opencv.core.Size; @@ -17,8 +18,10 @@ import android.graphics.Canvas; import android.graphics.Rect; import android.util.AttributeSet; import android.util.Log; +import android.view.Surface; import android.view.SurfaceHolder; import android.view.SurfaceView; +import android.view.WindowManager; /** * This is a basic class, implementing the interaction with Camera and OpenCV library. @@ -189,8 +192,93 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac * This method returns single channel gray scale Mat with frame */ public Mat gray(); + + public void release(); }; + public class RotatedCameraFrame implements CvCameraViewFrame { + @Override + public Mat gray() { + if (mRotation != 0) { + Core.rotate(mFrame.gray(), mGrayRotated, getCvRotationCode(mRotation)); + return mGrayRotated; + } else { + return mFrame.gray(); + } + } + + @Override + public Mat rgba() { + if (mRotation != 0) { + Core.rotate(mFrame.rgba(), mRgbaRotated, getCvRotationCode(mRotation)); + return mRgbaRotated; + } else { + return mFrame.rgba(); + } + } + + private int getCvRotationCode(int degrees) { + if (degrees == 90) { + return Core.ROTATE_90_CLOCKWISE; + } else if (degrees == 180) { + return Core.ROTATE_180; + } else { + return Core.ROTATE_90_COUNTERCLOCKWISE; + } + } + + public RotatedCameraFrame(CvCameraViewFrame frame, int rotation) { + super(); + mFrame = frame; + mRgbaRotated = new Mat(); + mGrayRotated = new Mat(); + mRotation = 
rotation; + } + + @Override + public void release() { + mRgbaRotated.release(); + mGrayRotated.release(); + } + + public CvCameraViewFrame mFrame; + private Mat mRgbaRotated; + private Mat mGrayRotated; + private int mRotation; + }; + + /** + * Calculates how to rotate camera frame to match current screen orientation + */ + protected int getFrameRotation(boolean cameraFacingFront, int cameraSensorOrientation) { + WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE); + int screenOrientation = windowManager.getDefaultDisplay().getRotation(); + int screenRotation = 0; + switch (screenOrientation) { + case Surface.ROTATION_0: + screenRotation = 0; + break; + case Surface.ROTATION_90: + screenRotation = 90; + break; + case Surface.ROTATION_180: + screenRotation = 180; + break; + case Surface.ROTATION_270: + screenRotation = 270; + break; + } + + int frameRotation; + if (cameraFacingFront) { + frameRotation = (cameraSensorOrientation + screenRotation) % 360; + } else { + frameRotation = (cameraSensorOrientation - screenRotation + 360) % 360; + } + + return frameRotation; + } + public void surfaceChanged(SurfaceHolder arg0, int arg1, int arg2, int arg3) { Log.d(TAG, "call surfaceChanged event"); synchronized(mSyncObject) { diff --git a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java index 1c10c3cb12..b76f186101 100644 --- a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java +++ b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java @@ -42,7 +42,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb private boolean mStopThread; protected Camera mCamera; - protected JavaCameraFrame[] mCameraFrame; + protected RotatedCameraFrame[] mCameraFrame; private SurfaceTexture mSurfaceTexture; private int mPreviewFormat = ImageFormat.NV21; @@ -132,7 +132,11 @@ 
public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb if (mCamera == null) return false; - int frameRotation = getFrameRotation(cameraId); + android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo(); + android.hardware.Camera.getCameraInfo(cameraId, info); + int frameRotation = getFrameRotation( + info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT, + info.orientation); /* Now set camera parameters */ try { Camera.Parameters params = mCamera.getParameters(); @@ -206,9 +210,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb AllocateCache(); - mCameraFrame = new JavaCameraFrame[2]; - mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight, frameRotation); - mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight, frameRotation); + mCameraFrame = new RotatedCameraFrame[2]; + mCameraFrame[0] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight), frameRotation); + mCameraFrame[1] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight), frameRotation); if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) { mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID); @@ -245,7 +249,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mFrameChain[1].release(); } if (mCameraFrame != null) { + mCameraFrame[0].mFrame.release(); mCameraFrame[0].release(); + mCameraFrame[1].mFrame.release(); mCameraFrame[1].release(); } } @@ -318,14 +324,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb private class JavaCameraFrame implements CvCameraViewFrame { @Override public Mat gray() { - mGray = mYuvFrameData.submat(0, mHeight, 0, mWidth); - - if (mRotation != 0) { - Core.rotate(mGray, mGrayRotated, getCvRotationCode(mRotation)); - return mGrayRotated; - } else { - return mGray; - } + return 
mYuvFrameData.submat(0, mHeight, 0, mWidth); } @Override @@ -337,85 +336,28 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb else throw new IllegalArgumentException("Preview Format can be NV21 or YV12"); - if (mRotation != 0) { - Core.rotate(mRgba, mRgbaRotated, getCvRotationCode(mRotation)); - return mRgbaRotated; - } else { - return mRgba; - } + return mRgba; } - private int getCvRotationCode(int degrees) { - if (degrees == 90) { - return Core.ROTATE_90_CLOCKWISE; - } else if (degrees == 180) { - return Core.ROTATE_180; - } else { - return Core.ROTATE_90_COUNTERCLOCKWISE; - } - } - - public JavaCameraFrame(Mat Yuv420sp, int width, int height, int rotation) { + public JavaCameraFrame(Mat Yuv420sp, int width, int height) { super(); mWidth = width; mHeight = height; mYuvFrameData = Yuv420sp; mRgba = new Mat(); - mRgbaRotated = new Mat(); - mGrayRotated = new Mat(); - mRotation = rotation; } + @Override public void release() { mRgba.release(); } private Mat mYuvFrameData; private Mat mRgba; - private Mat mRgbaRotated; - private Mat mGray; - private Mat mGrayRotated; private int mWidth; private int mHeight; - private int mRotation; }; - /** - * Calculates how to rotate camera frame to match current screen orientation - */ - private int getFrameRotation(int cameraId) { - WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE); - int screenOrientation = windowManager.getDefaultDisplay().getRotation(); - int screenRotation = 0; - switch (screenOrientation) { - case Surface.ROTATION_0: - screenRotation = 0; - break; - case Surface.ROTATION_90: - screenRotation = 90; - break; - case Surface.ROTATION_180: - screenRotation = 180; - break; - case Surface.ROTATION_270: - screenRotation = 270; - break; - } - - android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo(); - android.hardware.Camera.getCameraInfo(cameraId, info); - - int frameRotation; - if (info.facing == 
Camera.CameraInfo.CAMERA_FACING_FRONT) { - frameRotation = (info.orientation + screenRotation) % 360; - frameRotation = (360 - frameRotation) % 360; - } else { - frameRotation = (info.orientation - screenRotation + 360) % 360; - } - - return frameRotation; - } - private class CameraWorker implements Runnable { @Override From d269de0a03bcf198cabc10b5164adc3850a1d2aa Mon Sep 17 00:00:00 2001 From: fengyuentau Date: Thu, 18 Jan 2024 11:17:50 +0800 Subject: [PATCH 54/57] initial commit --- modules/dnn/src/layers/scatterND_layer.cpp | 5 +++++ modules/dnn/src/layers/scatter_layer.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/modules/dnn/src/layers/scatterND_layer.cpp b/modules/dnn/src/layers/scatterND_layer.cpp index 531d32f45b..0ab02146cb 100644 --- a/modules/dnn/src/layers/scatterND_layer.cpp +++ b/modules/dnn/src/layers/scatterND_layer.cpp @@ -74,6 +74,11 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (inputs_arr.depth() == CV_16S) { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp index 3b803b16c1..24e4b54bc8 100644 --- a/modules/dnn/src/layers/scatter_layer.cpp +++ b/modules/dnn/src/layers/scatter_layer.cpp @@ -68,6 +68,11 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (inputs_arr.depth() == CV_16S) { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); From e64857c5611d5898b7b30640a775331488a5ebef Mon Sep 17 00:00:00 2001 From: Sean McBride Date: Fri, 19 Jan 2024 08:53:08 -0500 Subject: [PATCH 55/57] Merge pull request #23736 from seanm:c++11-simplifications Removed all pre-C++11 code, workarounds, and branches #23736 This removes a bunch of 
pre-C++11 workarounds that are no longer necessary as C++11 is now required. It is a nice clean up and simplification. * No longer unconditionally #include <array> in cvdef.h, include explicitly where needed * Removed deprecated CV_NODISCARD, already unused in the codebase * Removed some pre-C++11 workarounds, and simplified some backwards compat defines * Removed CV_CXX_STD_ARRAY * Removed CV_CXX_MOVE_SEMANTICS and CV_CXX_MOVE * Removed all tests of CV_CXX11, now assume it's always true. This allowed removing a lot of dead code. * Updated some documentation consequently. * Removed all tests of CV_CXX11, now assume it's always true * Fixed links. --------- Co-authored-by: Maksim Shabunin Co-authored-by: Alexander Smorkalov --- .../how_to_use_OpenCV_parallel_for_.markdown | 19 ++-- modules/core/include/opencv2/core/async.hpp | 4 - modules/core/include/opencv2/core/cvdef.h | 94 ++++--------------- .../opencv2/core/detail/async_promise.hpp | 2 - .../opencv2/core/detail/exception_ptr.hpp | 8 +- modules/core/include/opencv2/core/eigen.hpp | 3 +- modules/core/include/opencv2/core/mat.hpp | 1 + modules/core/include/opencv2/core/matx.hpp | 4 - .../core/utils/allocator_stats.impl.hpp | 56 +---------- modules/core/src/async.cpp | 33 ------- modules/core/src/matrix_wrap.cpp | 10 -- modules/core/src/parallel.cpp | 3 +- modules/core/src/system.cpp | 39 +------- modules/core/test/test_async.cpp | 4 +- modules/core/test/test_misc.cpp | 4 - modules/core/test/test_precomp.hpp | 2 + modules/core/test/test_utils_tls.impl.hpp | 6 -- modules/dnn/src/net_impl.cpp | 4 - modules/dnn/src/net_impl.hpp | 2 - modules/dnn/src/onnx/onnx_importer.cpp | 1 + modules/dnn/test/test_misc.cpp | 4 - modules/imgcodecs/src/loadsave.cpp | 7 -- modules/objdetect/src/precomp.hpp | 2 + modules/objdetect/src/qrcode.cpp | 1 + modules/objdetect/test/test_precomp.hpp | 6 +- modules/objdetect/test/test_qrcode_encode.cpp | 10 -- modules/python/src2/cv2_util.cpp | 4 - modules/python/src2/hdr_parser.py | 3 +-
modules/stitching/src/precomp.hpp | 1 + modules/ts/include/opencv2/ts.hpp | 4 - modules/videoio/src/cap_mfx_common.hpp | 15 --- modules/videoio/src/cap_msmf.cpp | 1 + .../how_to_use_OpenCV_parallel_for_.cpp | 12 ++- .../how_to_use_OpenCV_parallel_for_new.cpp | 12 ++- .../mat_the_basic_image_container.cpp | 4 +- samples/dnn/object_detection.cpp | 2 +- 36 files changed, 68 insertions(+), 319 deletions(-) diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown index 4c68efecd0..ab24d27ab1 100644 --- a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown +++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown @@ -9,6 +9,9 @@ How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_u | -: | :- | | Compatibility | OpenCV >= 3.0 | + +@note See also C++ lambda usage with parallel for in [tuturial](@ref tutorial_how_to_use_OpenCV_parallel_for_new). + Goal ---- @@ -20,7 +23,7 @@ If you want more information about multithreading, you will have to refer to a r to remain simple. Precondition ----- +------------ The first precondition is to have OpenCV built with a parallel framework. In OpenCV 3.2, the following parallel frameworks are available in that order: @@ -50,7 +53,7 @@ We will use the example of drawing a Mandelbrot set to show how from a regular s the code to parallelize the computation. Theory ------------ +------ The Mandelbrot set definition has been named in tribute to the mathematician Benoit Mandelbrot by the mathematician Adrien Douady. 
It has been famous outside of the mathematics field as the image representation is an example of a @@ -69,7 +72,7 @@ Here, we will just introduce the formula to draw the Mandelbrot set (from the me > \f[\limsup_{n\to\infty}|z_{n+1}|\leqslant2\f] Pseudocode ------------ +---------- A simple algorithm to generate a representation of the Mandelbrot set is called the ["escape time algorithm"](https://en.wikipedia.org/wiki/Mandelbrot_set#Escape_time_algorithm). @@ -110,10 +113,10 @@ On this figure, we recall that the real part of a complex number is on the x-axi You can see that the whole shape can be repeatedly visible if we zoom at particular locations. Implementation ------------ +-------------- Escape time algorithm implementation --------------------------- +------------------------------------ @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-escape-time-algorithm @@ -121,7 +124,7 @@ Here, we used the [`std::complex`](http://en.cppreference.com/w/cpp/numeric/comp complex number. This function performs the test to check if the pixel is in set or not and returns the "escaped" iteration. Sequential Mandelbrot implementation --------------------------- +------------------------------------ @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-sequential @@ -149,7 +152,7 @@ The green curve corresponds to a simple linear scale transformation, the blue on and you can observe how the lowest values will be boosted when looking at the slope at these positions. Parallel Mandelbrot implementation --------------------------- +---------------------------------- When looking at the sequential implementation, we can notice that each pixel is computed independently. 
To optimize the computation, we can perform multiple pixel calculations in parallel, by exploiting the multi-core architecture of modern @@ -181,7 +184,7 @@ C++ 11 standard allows to simplify the parallel implementation by get rid of the @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel-call-cxx11 Results ------------ +------- You can find the full tutorial code [here](https://github.com/opencv/opencv/blob/4.x/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp). The performance of the parallel implementation depends of the type of CPU you have. For instance, on 4 cores / 8 threads diff --git a/modules/core/include/opencv2/core/async.hpp b/modules/core/include/opencv2/core/async.hpp index 54560c7d00..98868a130b 100644 --- a/modules/core/include/opencv2/core/async.hpp +++ b/modules/core/include/opencv2/core/async.hpp @@ -7,10 +7,8 @@ #include -#ifdef CV_CXX11 //#include #include -#endif namespace cv { @@ -69,7 +67,6 @@ public: CV_WRAP bool valid() const CV_NOEXCEPT; -#ifdef CV_CXX11 inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; } inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; } @@ -89,7 +86,6 @@ public: std::future getFutureMat() const; std::future getFutureUMat() const; #endif -#endif // PImpl diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index b55ac3b4ba..b6879ff5a5 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -752,89 +752,44 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType) #endif -/****************************************************************************************\ -* CV_NODISCARD attribute (deprecated, GCC only) * -* DONT USE: use instead the standard CV_NODISCARD_STD macro above * -* this legacy method silently fails to issue warning until some version * -* after gcc 6.3.0. 
Yet with gcc 7+ you can use the above standard method * -* which makes this method useless. Don't use it. * -* @deprecated use instead CV_NODISCARD_STD * -\****************************************************************************************/ -#ifndef CV_NODISCARD -# if defined(__GNUC__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) -# elif defined(__clang__) && defined(__has_attribute) -# if __has_attribute(__warn_unused_result__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) -# endif -# endif -#endif -#ifndef CV_NODISCARD -# define CV_NODISCARD /* nothing by default */ -#endif - - /****************************************************************************************\ * C++ 11 * \****************************************************************************************/ -#ifndef CV_CXX11 -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800) -# define CV_CXX11 1 +#ifdef __cplusplus +// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11, +// so check _MSC_VER instead. 
See: +// +# if defined(_MSC_VER) +# if _MSC_VER < 1800 +# error "OpenCV 4.x+ requires enabled C++11 support" +# endif +# elif __cplusplus < 201103L +# error "OpenCV 4.x+ requires enabled C++11 support" # endif -#else -# if CV_CXX11 == 0 -# undef CV_CXX11 -# endif -#endif -#ifndef CV_CXX11 -# error "OpenCV 4.x+ requires enabled C++11 support" #endif -#define CV_CXX_MOVE_SEMANTICS 1 -#define CV_CXX_MOVE(x) std::move(x) -#define CV_CXX_STD_ARRAY 1 -#include +#ifndef CV_CXX11 +# define CV_CXX11 1 +#endif + #ifndef CV_OVERRIDE # define CV_OVERRIDE override #endif + #ifndef CV_FINAL # define CV_FINAL final #endif #ifndef CV_NOEXCEPT -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_NOEXCEPT noexcept -# endif -#endif -#ifndef CV_NOEXCEPT -# define CV_NOEXCEPT +# define CV_NOEXCEPT noexcept #endif #ifndef CV_CONSTEXPR -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_CONSTEXPR constexpr -# endif -#endif -#ifndef CV_CONSTEXPR -# define CV_CONSTEXPR +# define CV_CONSTEXPR constexpr #endif // Integer types portability -#ifdef OPENCV_STDINT_HEADER -#include OPENCV_STDINT_HEADER -#elif defined(__cplusplus) -#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */ -namespace cv { -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int16_t; -typedef unsigned short uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -} -#elif defined(_MSC_VER) || __cplusplus >= 201103L +#ifdef __cplusplus #include namespace cv { using std::int8_t; @@ -846,19 +801,6 @@ using std::uint32_t; using std::int64_t; using std::uint64_t; } -#else -#include -namespace cv { -typedef ::int8_t int8_t; -typedef ::uint8_t uint8_t; -typedef ::int16_t int16_t; -typedef ::uint16_t uint16_t; -typedef ::int32_t int32_t; -typedef ::uint32_t uint32_t; -typedef ::int64_t int64_t; -typedef ::uint64_t uint64_t; 
-} -#endif #else // pure C #include #endif diff --git a/modules/core/include/opencv2/core/detail/async_promise.hpp b/modules/core/include/opencv2/core/detail/async_promise.hpp index 6eb3fb52c1..c039ec046a 100644 --- a/modules/core/include/opencv2/core/detail/async_promise.hpp +++ b/modules/core/include/opencv2/core/detail/async_promise.hpp @@ -52,10 +52,8 @@ public: */ void setException(const cv::Exception& exception); -#ifdef CV_CXX11 explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; } AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; } -#endif // PImpl diff --git a/modules/core/include/opencv2/core/detail/exception_ptr.hpp b/modules/core/include/opencv2/core/detail/exception_ptr.hpp index d98ffc40c6..a1a591e455 100644 --- a/modules/core/include/opencv2/core/detail/exception_ptr.hpp +++ b/modules/core/include/opencv2/core/detail/exception_ptr.hpp @@ -8,14 +8,8 @@ #ifndef CV__EXCEPTION_PTR # if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2 # define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938 -# elif defined(CV_CXX11) +# else # define CV__EXCEPTION_PTR 1 -# elif defined(_MSC_VER) -# define CV__EXCEPTION_PTR (_MSC_VER >= 1600) -# elif defined(__clang__) -# define CV__EXCEPTION_PTR 0 // C++11 only (see above) -# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__) -# define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0) # endif #endif #ifndef CV__EXCEPTION_PTR diff --git a/modules/core/include/opencv2/core/eigen.hpp b/modules/core/include/opencv2/core/eigen.hpp index f176409cc3..231c6805c0 100644 --- a/modules/core/include/opencv2/core/eigen.hpp +++ b/modules/core/include/opencv2/core/eigen.hpp @@ -61,8 +61,7 @@ #endif #if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT) -#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \ - && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY) +#if EIGEN_WORLD_VERSION == 3 && 
EIGEN_MAJOR_VERSION >= 3 #include #define OPENCV_EIGEN_TENSOR_SUPPORT 1 #endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index c4c6ff6d78..2bfb0966c2 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -53,6 +53,7 @@ #include "opencv2/core/bufferpool.hpp" +#include #include namespace cv diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index c5e769f0ab..b4b2e7ef17 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -376,10 +376,8 @@ public: static Vec randn(_Tp a, _Tp b); static Vec randu(_Tp a, _Tp b); static Vec zeros(); -#ifdef CV_CXX11 static Vec diag(_Tp alpha) = delete; static Vec eye() = delete; -#endif //! per-element multiplication Vec mul(const Vec<_Tp, cn>& v) const; @@ -402,9 +400,7 @@ public: const _Tp& operator ()(int i) const; _Tp& operator ()(int i); -#ifdef CV_CXX11 Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default; -#endif Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp); Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp); diff --git a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp index eb5ecde16b..bbc6cf8979 100644 --- a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp +++ b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp @@ -9,8 +9,6 @@ //#define OPENCV_DISABLE_ALLOCATOR_STATS -#ifdef CV_CXX11 - #include #ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE @@ -26,14 +24,6 @@ #define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long #endif -#else // CV_CXX11 - -#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE -#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int // CV_XADD supports int only -#endif - -#endif // CV_CXX11 - namespace cv 
{ namespace utils { #ifdef CV__ALLOCATOR_STATS_LOG @@ -59,7 +49,7 @@ public: void onAllocate(size_t /*sz*/) {} void onFree(size_t /*sz*/) {} -#elif defined(CV_CXX11) +#else protected: typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; @@ -104,49 +94,7 @@ public: #endif curr -= (counter_t)sz; } - -#else // non C++11 - -protected: - typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; - volatile counter_t curr, total, total_allocs, peak; // overflow is possible, CV_XADD operates with 'int' only -public: - AllocatorStatistics() - : curr(0), total(0), total_allocs(0), peak(0) - {} - ~AllocatorStatistics() CV_OVERRIDE {} - - uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; } - uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; } - uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; } - uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; } - - void resetPeakUsage() CV_OVERRIDE { peak = curr; } - - // Controller interface - void onAllocate(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - - counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz; - - peak = std::max((counter_t)peak, new_curr); // non-thread safe - - //CV_XADD(&total, (uint64_t)sz); // overflow with int, non-reliable... 
- total += sz; - - CV_XADD(&total_allocs, (counter_t)1); - } - void onFree(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - CV_XADD(&curr, (counter_t)-sz); - } -#endif +#endif // OPENCV_DISABLE_ALLOCATOR_STATS }; #ifdef CV__ALLOCATOR_STATS_LOG diff --git a/modules/core/src/async.cpp b/modules/core/src/async.cpp index 78c0a1ee81..3aeaaf7394 100644 --- a/modules/core/src/async.cpp +++ b/modules/core/src/async.cpp @@ -3,7 +3,6 @@ // of this distribution and at http://opencv.org/license.html. #include "precomp.hpp" -//#undef CV_CXX11 // debug non C++11 mode #include "opencv2/core/async.hpp" #include "opencv2/core/detail/async_promise.hpp" @@ -16,11 +15,9 @@ #ifndef OPENCV_DISABLE_THREAD_SUPPORT -#ifdef CV_CXX11 #include #include #include -#endif namespace cv { @@ -37,12 +34,8 @@ struct AsyncArray::Impl void releasePromise() CV_NOEXCEPT { CV_XADD(&refcount_promise, -1); if(1 == CV_XADD(&refcount, -1)) delete this; } \ int refcount_promise; -#ifdef CV_CXX11 mutable std::mutex mtx; mutable std::condition_variable cond_var; -#else - mutable cv::Mutex mtx; -#endif mutable bool has_result; // Mat, UMat or exception @@ -88,11 +81,7 @@ struct AsyncArray::Impl if (!wait_for(timeoutNs)) return false; } -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif if (has_result) { if (!result_mat.empty()) @@ -145,7 +134,6 @@ struct AsyncArray::Impl if (timeoutNs == 0) return has_result; CV_LOG_INFO(NULL, "Waiting for async result ..."); -#ifdef CV_CXX11 std::unique_lock lock(mtx); const auto cond_pred = [&]{ return has_result == true; }; if (timeoutNs > 0) @@ -156,9 +144,6 @@ struct AsyncArray::Impl CV_Assert(has_result); return true; } -#else - CV_Error(Error::StsNotImplemented, "OpenCV has been built without async waiting support (C++11 is required)"); -#endif } AsyncArray getArrayResult() @@ -175,11 +160,7 @@ struct AsyncArray::Impl { if 
(future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); int k = value.kind(); if (k == _InputArray::UMAT) @@ -193,9 +174,7 @@ struct AsyncArray::Impl value.copyTo(*result_mat.get()); } has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } #if CV__EXCEPTION_PTR @@ -203,18 +182,12 @@ struct AsyncArray::Impl { if (future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); has_exception = true; exception = e; has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } #endif @@ -222,18 +195,12 @@ struct AsyncArray::Impl { if (future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); has_exception = true; cv_exception = e; has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } }; diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp index bb61ce2de1..fa9a23ee2a 100644 --- a/modules/core/src/matrix_wrap.cpp +++ b/modules/core/src/matrix_wrap.cpp @@ -1919,12 +1919,7 @@ void _OutputArray::move(UMat& u) const int k = kind(); if (k == UMAT) { -#ifdef CV_CXX11 *(UMat*)obj = std::move(u); -#else - *(UMat*)obj = u; - u.release(); -#endif } else if (k == MAT) { @@ -1959,12 +1954,7 @@ void _OutputArray::move(Mat& m) const } else if (k == MAT) { -#ifdef CV_CXX11 *(Mat*)obj = std::move(m); -#else - *(Mat*)obj = m; - m.release(); -#endif } else if (k == MATX) { diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 4525928b94..84e94da877 100644 --- a/modules/core/src/parallel.cpp +++ 
b/modules/core/src/parallel.cpp @@ -912,8 +912,7 @@ int getNumberOfCPUs_() * the minimum most value as it has high probablity of being right and safe. * Return 1 if we get 0 or not found on all methods. */ -#if defined CV_CXX11 \ - && !defined(__MINGW32__) /* not implemented (2020-03) */ \ +#if !defined(__MINGW32__) /* not implemented (2020-03) */ /* * Check for this standard C++11 way, we do not return directly because diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 0a6e2f5037..b9d3ecaa1a 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -305,9 +305,7 @@ DECLARE_CV_CPUID_X86 #endif #endif -#if defined CV_CXX11 - #include -#endif +#include namespace cv { @@ -909,50 +907,15 @@ bool useOptimized(void) int64 getTickCount(void) { -#if defined CV_CXX11 std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); return (int64)now.time_since_epoch().count(); -#elif defined _WIN32 || defined WINCE - LARGE_INTEGER counter; - QueryPerformanceCounter( &counter ); - return (int64)counter.QuadPart; -#elif defined __MACH__ && defined __APPLE__ - return (int64)mach_absolute_time(); -#elif defined __unix__ - struct timespec tp; - clock_gettime(CLOCK_MONOTONIC, &tp); - return (int64)tp.tv_sec*1000000000 + tp.tv_nsec; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - return (int64)tv.tv_sec*1000000 + tv.tv_usec; -#endif } double getTickFrequency(void) { -#if defined CV_CXX11 using clock_period_t = std::chrono::steady_clock::duration::period; double clock_freq = clock_period_t::den / clock_period_t::num; return clock_freq; -#elif defined _WIN32 || defined WINCE - LARGE_INTEGER freq; - QueryPerformanceFrequency(&freq); - return (double)freq.QuadPart; -#elif defined __MACH__ && defined __APPLE__ - static double freq = 0; - if( freq == 0 ) - { - mach_timebase_info_data_t sTimebaseInfo; - mach_timebase_info(&sTimebaseInfo); - freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer; - } - return freq; -#elif 
defined __unix__ - return 1e9; -#else - return 1e6; -#endif } #if defined __GNUC__ && (defined __i386__ || defined __x86_64__ || defined __ppc__) diff --git a/modules/core/test/test_async.cpp b/modules/core/test/test_async.cpp index 58bcfddcd7..2fcee300cf 100644 --- a/modules/core/test/test_async.cpp +++ b/modules/core/test/test_async.cpp @@ -7,7 +7,7 @@ #include -#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT) +#if !defined(OPENCV_DISABLE_THREAD_SUPPORT) #include #include #endif @@ -85,7 +85,7 @@ TEST(Core_Async, LikePythonTest) } -#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT) +#if !defined(OPENCV_DISABLE_THREAD_SUPPORT) TEST(Core_Async, AsyncThread_Simple) { diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp index 6f50600f40..39d0788d64 100644 --- a/modules/core/test/test_misc.cpp +++ b/modules/core/test/test_misc.cpp @@ -8,10 +8,8 @@ #include -#ifdef CV_CXX11 #include #include -#endif namespace opencv_test { namespace { @@ -282,9 +280,7 @@ public: // FP state is not supported // no checks } -#ifdef CV_CXX11 std::this_thread::sleep_for(std::chrono::milliseconds(100)); -#endif } cv::details::FPDenormalsModeState base_state; diff --git a/modules/core/test/test_precomp.hpp b/modules/core/test/test_precomp.hpp index 81ddf45de9..3d9e5a9f39 100644 --- a/modules/core/test/test_precomp.hpp +++ b/modules/core/test/test_precomp.hpp @@ -4,6 +4,8 @@ #ifndef __OPENCV_TEST_PRECOMP_HPP__ #define __OPENCV_TEST_PRECOMP_HPP__ +#include + #include "opencv2/ts.hpp" #include "opencv2/ts/ocl_test.hpp" #include "opencv2/core/private.hpp" diff --git a/modules/core/test/test_utils_tls.impl.hpp b/modules/core/test/test_utils_tls.impl.hpp index 36b8805422..20facabadd 100644 --- a/modules/core/test/test_utils_tls.impl.hpp +++ b/modules/core/test/test_utils_tls.impl.hpp @@ -4,9 +4,7 @@ // This is .hpp file included from test_utils.cpp -#ifdef CV_CXX11 #include // std::thread -#endif #include "opencv2/core/utils/tls.hpp" @@ -34,8 
+32,6 @@ public: int TLSReporter::g_last_id = 0; int TLSReporter::g_allocated = 0; -#ifdef CV_CXX11 - template static void callNThreadsWithTLS(int N, TLSData& tls) { @@ -129,6 +125,4 @@ static void testTLSAccumulator(bool detachFirst) TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); } TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); } -#endif - }} // namespace diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 16ae6d7bfb..09258642f9 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -918,7 +918,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName) CV_Assert(!empty()); FPDenormalsIgnoreHintScope fp_denormals_ignore_scope; -#ifdef CV_CXX11 String layerName = outputName; if (layerName.empty()) @@ -939,9 +938,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName) isAsync = false; return getBlobAsync(layerName); -#else - CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward requires build with enabled C++11"); -#endif // CV_CXX11 } diff --git a/modules/dnn/src/net_impl.hpp b/modules/dnn/src/net_impl.hpp index d935655c4a..a11db8fb30 100644 --- a/modules/dnn/src/net_impl.hpp +++ b/modules/dnn/src/net_impl.hpp @@ -273,11 +273,9 @@ struct Net::Impl : public detail::NetImplBase Mat getBlob(String outputName) const; -#ifdef CV_CXX11 virtual AsyncArray getBlobAsync(const LayerPin& pin); AsyncArray getBlobAsync(String outputName); -#endif // CV_CXX11 string dump(bool forceAllocation = false) const; diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 5afff7db00..72b93dfef3 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -22,6 +22,7 @@ #ifdef HAVE_PROTOBUF +#include #include #include #include diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index 12d3964196..b4b691e318 100644 --- a/modules/dnn/test/test_misc.cpp +++ 
b/modules/dnn/test/test_misc.cpp @@ -1035,14 +1035,10 @@ TEST_P(Test_two_inputs, basic) randu(firstInp, 0, 100); randu(secondInp, 0, 100); -#ifndef CV_CXX11 std::vector input_names; input_names.push_back("data"); input_names.push_back("second_input"); net.setInputsNames(input_names); -#else - net.setInputsNames({"data", "second_input"}); -#endif net.setInput(firstInp, "data", kScale); net.setInput(secondInp, "second_input", kScaleInv); net.setPreferableBackend(backendId); diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp index 734f7b516c..79db3ac14d 100644 --- a/modules/imgcodecs/src/loadsave.cpp +++ b/modules/imgcodecs/src/loadsave.cpp @@ -210,15 +210,8 @@ struct ImageCodecInitializer static ImageCodecInitializer& getCodecs() { -#ifdef CV_CXX11 static ImageCodecInitializer g_codecs; return g_codecs; -#else - // C++98 doesn't guarantee correctness of multi-threaded initialization of static global variables - // (memory leak here is not critical, use C++11 to avoid that) - static ImageCodecInitializer* g_codecs = new ImageCodecInitializer(); - return *g_codecs; -#endif } /** diff --git a/modules/objdetect/src/precomp.hpp b/modules/objdetect/src/precomp.hpp index 790a980697..63ca440076 100644 --- a/modules/objdetect/src/precomp.hpp +++ b/modules/objdetect/src/precomp.hpp @@ -52,5 +52,7 @@ #include "opencv2/core/private.hpp" #include +#include +#include #endif diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index dd127b38b0..3590136e3b 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -15,6 +15,7 @@ #include "quirc.h" #endif +#include #include #include #include diff --git a/modules/objdetect/test/test_precomp.hpp b/modules/objdetect/test/test_precomp.hpp index 88b8e9a4f5..452a0d78d6 100644 --- a/modules/objdetect/test/test_precomp.hpp +++ b/modules/objdetect/test/test_precomp.hpp @@ -7,10 +7,6 @@ #include "opencv2/ts.hpp" #include "opencv2/objdetect.hpp" -#if 
defined CV_CXX11 - #include -#else - #include -#endif +#include #endif diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp index 7f5eb37f09..45567b5d9b 100644 --- a/modules/objdetect/test/test_qrcode_encode.cpp +++ b/modules/objdetect/test/test_qrcode_encode.cpp @@ -5,16 +5,6 @@ #include "test_precomp.hpp" namespace opencv_test { namespace { -#if !defined CV_CXX11 -// Wrapper for generating seeded random number via std::rand. -template -class SeededRandFunctor { -public: - SeededRandFunctor() { std::srand(Seed); } - int operator()(int i) { return std::rand() % (i + 1); } -}; -#endif - std::string encode_qrcode_images_name[] = { "version1_mode1.png", "version1_mode2.png", "version1_mode4.png", "version2_mode1.png", "version2_mode2.png", "version2_mode4.png", diff --git a/modules/python/src2/cv2_util.cpp b/modules/python/src2/cv2_util.cpp index d3691d3a59..817a4a8eff 100644 --- a/modules/python/src2/cv2_util.cpp +++ b/modules/python/src2/cv2_util.cpp @@ -128,11 +128,7 @@ void pyPopulateArgumentConversionErrors() PySafeObject exception_message(PyObject_Str(exception_value)); std::string message; getUnicodeString(exception_message, message); -#ifdef CV_CXX11 conversionErrorsTLS.getRef().push_back(std::move(message)); -#else - conversionErrorsTLS.getRef().push_back(message); -#endif } } diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index 34bcd585ce..d4a5b26a9f 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -455,8 +455,7 @@ class CppHeaderParser(object): ("CV_INLINE", ""), ("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", ""), - ("CV_NODISCARD_STD", ""), - ("CV_NODISCARD", "")]).strip() + ("CV_NODISCARD_STD", "")]).strip() if decl_str.strip().startswith('virtual'): virtual_method = True diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp index debc0d2088..2a1177496a 100644 --- 
a/modules/stitching/src/precomp.hpp +++ b/modules/stitching/src/precomp.hpp @@ -45,6 +45,7 @@ #include "opencv2/opencv_modules.hpp" +#include #include #include #include diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index 86f2d07761..a768d0047b 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -941,13 +941,9 @@ namespace opencv_test { using namespace cvtest; using namespace cv; -#ifdef CV_CXX11 #define CVTEST_GUARD_SYMBOL(name) \ class required_namespace_specificatin_here_for_symbol_ ## name {}; \ using name = required_namespace_specificatin_here_for_symbol_ ## name; -#else -#define CVTEST_GUARD_SYMBOL(name) /* nothing */ -#endif CVTEST_GUARD_SYMBOL(norm) CVTEST_GUARD_SYMBOL(add) diff --git a/modules/videoio/src/cap_mfx_common.hpp b/modules/videoio/src/cap_mfx_common.hpp index 9824e89dc5..b10d7115ba 100644 --- a/modules/videoio/src/cap_mfx_common.hpp +++ b/modules/videoio/src/cap_mfx_common.hpp @@ -334,26 +334,11 @@ protected: // TODO: move to core::util? 
-#ifdef CV_CXX11 #include static void sleep_ms(int64 ms) { std::this_thread::sleep_for(std::chrono::milliseconds(ms)); } -#elif defined(__linux__) -#include -static void sleep_ms(int64 ms) -{ - nanosleep(ms * 1000 * 1000); -} -#elif defined _WIN32 -static void sleep_ms(int64 ms) -{ - Sleep(ms); -} -#else -#error "Can not detect sleep_ms() implementation" -#endif // Linux specific diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp index 6fbcd2aa02..93545c615e 100644 --- a/modules/videoio/src/cap_msmf.cpp +++ b/modules/videoio/src/cap_msmf.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp index 2dcc1ff107..5fbe81cd1d 100644 --- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp +++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp @@ -2,6 +2,8 @@ #include #include +#define PARALLEL_FOR_LAMBDA + using namespace std; using namespace cv; @@ -33,6 +35,8 @@ int mandelbrotFormula(const complex &z0, const int maxIter=500) { } //! [mandelbrot-grayscale-value] +#ifndef PARALLEL_FOR_LAMBDA + //! [mandelbrot-parallel] class ParallelMandelbrot : public ParallelLoopBody { @@ -71,6 +75,8 @@ private: }; //! [mandelbrot-parallel] +#endif // !PARALLEL_FOR_LAMBDA + //! [mandelbrot-sequential] void sequentialMandelbrot(Mat &img, const float x1, const float y1, const float scaleX, const float scaleY) { @@ -102,7 +108,7 @@ int main() double t1 = (double) getTickCount(); - #ifdef CV_CXX11 +#ifdef PARALLEL_FOR_LAMBDA //! [mandelbrot-parallel-call-cxx11] parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), [&](const Range& range){ @@ -121,14 +127,14 @@ int main() }); //! 
[mandelbrot-parallel-call-cxx11] - #else +#else // PARALLEL_FOR_LAMBDA //! [mandelbrot-parallel-call] ParallelMandelbrot parallelMandelbrot(mandelbrotImg, x1, y1, scaleX, scaleY); parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), parallelMandelbrot); //! [mandelbrot-parallel-call] - #endif +#endif // PARALLEL_FOR_LAMBDA t1 = ((double) getTickCount() - t1) / getTickFrequency(); cout << "Parallel Mandelbrot: " << t1 << " s" << endl; diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp index cfa9d22b0d..cab73874a4 100644 --- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp +++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp @@ -4,6 +4,8 @@ #include #include +#define PARALLEL_FOR_LAMBDA + using namespace std; using namespace cv; @@ -47,7 +49,8 @@ void conv_seq(Mat src, Mat &dst, Mat kernel) } //! [convolution-sequential] -#ifdef CV_CXX11 +#ifdef PARALLEL_FOR_LAMBDA + void conv_parallel(Mat src, Mat &dst, Mat kernel) { int rows = src.rows, cols = src.cols; @@ -118,7 +121,8 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel) }); //! [convolution-parallel-cxx11-row-split] } -#else + +#else // PARALLEL_FOR_LAMBDA //! [convolution-parallel] class parallelConvolution : public ParallelLoopBody @@ -235,7 +239,7 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel) //! 
[convolution-parallel-function-row] } -#endif +#endif // PARALLEL_FOR_LAMBDA static void help(char *progName) { @@ -329,4 +333,4 @@ int main(int argc, char *argv[]) // imwrite("dst.png", dst); return 0; -} \ No newline at end of file +} diff --git a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp index ac1c205258..d9e0d1f94d 100644 --- a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp +++ b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp @@ -59,12 +59,12 @@ int main(int,char**) cout << "C = " << endl << " " << C << endl << endl; //! [comma] // do the same with initializer_list -#ifdef CV_CXX11 + //! [list] C = (Mat_({0, -1, 0, -1, 5, -1, 0, -1, 0})).reshape(3); cout << "C = " << endl << " " << C << endl << endl; //! [list] -#endif + //! [clone] Mat RowClone = C.row(1).clone(); cout << "RowClone = " << endl << " " << RowClone << endl << endl; diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp index 6fc8b2ab61..a0c255fd4e 100644 --- a/samples/dnn/object_detection.cpp +++ b/samples/dnn/object_detection.cpp @@ -5,7 +5,7 @@ #include #include -#if defined(CV_CXX11) && defined(HAVE_THREADS) +#if defined(HAVE_THREADS) #define USE_THREADS 1 #endif From dc987c094aacd2388c8b3e3f32b982bbdcce20f3 Mon Sep 17 00:00:00 2001 From: Jun <> Date: Sat, 20 Jan 2024 19:57:23 -0600 Subject: [PATCH 56/57] Update windows_install.markdown - add set opencv path for vc17 --- .../introduction/windows_install/windows_install.markdown | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown index eabf31482f..2568592d0c 100644 --- a/doc/tutorials/introduction/windows_install/windows_install.markdown +++ 
b/doc/tutorials/introduction/windows_install/windows_install.markdown @@ -378,6 +378,9 @@ our OpenCV library that we use in our projects. Start up a command window and en setx OpenCV_DIR D:\OpenCV\build\x64\vc16 (suggested for Visual Studio 2019 - 64 bit Windows) setx OpenCV_DIR D:\OpenCV\build\x86\vc16 (suggested for Visual Studio 2019 - 32 bit Windows) + + setx OpenCV_DIR D:\OpenCV\build\x64\vc17 (suggested for Visual Studio 2022 - 64 bit Windows) + setx OpenCV_DIR D:\OpenCV\build\x86\vc17 (suggested for Visual Studio 2022 - 32 bit Windows) @endcode Here the directory is where you have your OpenCV binaries (*extracted* or *built*). You can have different platform (e.g. x64 instead of x86) or compiler type, so substitute appropriate value. From 37c76b815c50ad6bd65505156eb20b5a28871d6a Mon Sep 17 00:00:00 2001 From: AleksandrPanov Date: Tue, 23 Jan 2024 13:45:28 +0300 Subject: [PATCH 57/57] fix generate of charuco chessboard image, add test --- modules/objdetect/src/aruco/aruco_board.cpp | 66 ++++++++++--------- .../objdetect/test/test_charucodetection.cpp | 51 ++++++++++++++ 2 files changed, 86 insertions(+), 31 deletions(-) diff --git a/modules/objdetect/src/aruco/aruco_board.cpp b/modules/objdetect/src/aruco/aruco_board.cpp index 3d4217e02a..f8d3d3c108 100644 --- a/modules/objdetect/src/aruco/aruco_board.cpp +++ b/modules/objdetect/src/aruco/aruco_board.cpp @@ -483,39 +483,44 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi Mat noMarginsImg = out.colRange(marginSize, out.cols - marginSize).rowRange(marginSize, out.rows - marginSize); - double totalLengthX, totalLengthY; - totalLengthX = squareLength * size.width; - totalLengthY = squareLength * size.height; - - // proportional transformation - double xReduction = totalLengthX / double(noMarginsImg.cols); - double yReduction = totalLengthY / double(noMarginsImg.rows); + // the size of the chessboard square depends on the location of the chessboard + float pixInSquare = 0.f; + // 
the size of the chessboard in pixels + Size pixInChessboard(noMarginsImg.cols, noMarginsImg.rows); // determine the zone where the chessboard is placed - Mat chessboardZoneImg; - if(xReduction > yReduction) { - int nRows = int(totalLengthY / xReduction); - int rowsMargins = (noMarginsImg.rows - nRows) / 2; - chessboardZoneImg = noMarginsImg.rowRange(rowsMargins, noMarginsImg.rows - rowsMargins); - } else { - int nCols = int(totalLengthX / yReduction); - int colsMargins = (noMarginsImg.cols - nCols) / 2; - chessboardZoneImg = noMarginsImg.colRange(colsMargins, noMarginsImg.cols - colsMargins); + float pixInSquareX = (float)noMarginsImg.cols / (float)size.width; + float pixInSquareY = (float)noMarginsImg.rows / (float)size.height; + Point startChessboard(0, 0); + if (pixInSquareX <= pixInSquareY) { + // the width of "noMarginsImg" image determines the dimensions of the chessboard + pixInSquare = pixInSquareX; + pixInChessboard.height = cvRound(pixInSquare*size.height); + int rowsMargin = (noMarginsImg.rows - pixInChessboard.height) / 2; + startChessboard.y = rowsMargin; } + else { + // the height of "noMarginsImg" image determines the dimensions of the chessboard + pixInSquare = pixInSquareY; + pixInChessboard.width = cvRound(pixInSquare*size.width); + int colsMargin = (noMarginsImg.cols - pixInChessboard.width) / 2; + startChessboard.x = colsMargin; + } + // determine the zone where the chessboard is located + Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard)); - // determine the margins to draw only the markers - // take the minimum just to be sure - double squareSizePixels = min(double(chessboardZoneImg.cols) / double(size.width), - double(chessboardZoneImg.rows) / double(size.height)); + // marker size in pixels + const float pixInMarker = markerLength/squareLength*pixInSquare; + // the size of the marker margin in pixels + const float pixInMarginMarker = 0.5f*(pixInSquare - pixInMarker); - double diffSquareMarkerLength = (squareLength - 
markerLength) / 2; - int diffSquareMarkerLengthPixels = - int(diffSquareMarkerLength * squareSizePixels / squareLength); + // determine the zone where the aruco markers are located + int endArucoX = cvRound(pixInSquare*(size.width-1)+pixInMarginMarker+pixInMarker); + int endArucoY = cvRound(pixInSquare*(size.height-1)+pixInMarginMarker+pixInMarker); + Mat arucoZone = chessboardZoneImg(Range(cvRound(pixInMarginMarker), endArucoY), Range(cvRound(pixInMarginMarker), endArucoX)); // draw markers - Mat markersImg; - Board::Impl::generateImage(chessboardZoneImg.size(), markersImg, diffSquareMarkerLengthPixels, borderBits); - markersImg.copyTo(chessboardZoneImg); + Board::Impl::generateImage(arucoZone.size(), arucoZone, 0, borderBits); // now draw black squares for(int y = 0; y < size.height; y++) { @@ -527,12 +532,11 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi if(y % 2 != x % 2) continue; // white corner, dont do anything } - double startX, startY; - startX = squareSizePixels * double(x); - startY = squareSizePixels * double(y); + float startX = pixInSquare * float(x); + float startY = pixInSquare * float(y); - Mat squareZone = chessboardZoneImg.rowRange(int(startY), int(startY + squareSizePixels)) - .colRange(int(startX), int(startX + squareSizePixels)); + Mat squareZone = chessboardZoneImg(Range(cvRound(startY), cvRound(startY + pixInSquare)), + Range(cvRound(startX), cvRound(startX + pixInSquare))); squareZone.setTo(0); } diff --git a/modules/objdetect/test/test_charucodetection.cpp b/modules/objdetect/test/test_charucodetection.cpp index 87520c873e..20a7036b40 100644 --- a/modules/objdetect/test/test_charucodetection.cpp +++ b/modules/objdetect/test/test_charucodetection.cpp @@ -769,6 +769,57 @@ TEST_P(CharucoBoard, testWrongSizeDetection) ASSERT_TRUE(detectedCharucoIds.empty()); } +TEST(CharucoBoardGenerate, issue_24806) +{ + aruco::Dictionary dict = aruco::getPredefinedDictionary(aruco::DICT_4X4_1000); + const float 
squareLength = 13.f, markerLength = 10.f; + const Size boardSize(7ull, 4ull); + const aruco::CharucoBoard board(boardSize, squareLength, markerLength, dict); + const int marginSize = 24; + Mat boardImg; + + // generate chessboard image + board.generateImage(Size(400, 300), boardImg, marginSize); + // This condition checks that the width of the image determines the dimensions of the chessboard in this test + CV_Assert((float)(boardImg.cols) / (float)boardSize.width <= + (float)(boardImg.rows) / (float)boardSize.height); + + // prepare data for chessboard image test + Mat noMarginsImg = boardImg(Range(marginSize, boardImg.rows - marginSize), + Range(marginSize, boardImg.cols - marginSize)); + const float pixInSquare = (float)(noMarginsImg.cols) / (float)boardSize.width; + + Size pixInChessboard(cvRound(pixInSquare*boardSize.width), cvRound(pixInSquare*boardSize.height)); + const Point startChessboard((noMarginsImg.cols - pixInChessboard.width) / 2, + (noMarginsImg.rows - pixInChessboard.height) / 2); + Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard)); + + // B - black pixel, W - white pixel + // chessboard corner 1: + // B W + // W B + Mat goldCorner1 = (Mat_(2, 2) << + 0, 255, + 255, 0); + // B - black pixel, W - white pixel + // chessboard corner 2: + // W B + // B W + Mat goldCorner2 = (Mat_(2, 2) << + 255, 0, + 0, 255); + + // test chessboard corners in generated image + for (const Point3f& p: board.getChessboardCorners()) { + Point2f chessCorner(pixInSquare*(p.x/squareLength), + pixInSquare*(p.y/squareLength)); + Mat winCorner = chessboardZoneImg(Rect(Point(cvRound(chessCorner.x) - 1, cvRound(chessCorner.y) - 1), Size(2, 2))); + bool eq = (cv::countNonZero(goldCorner1 != winCorner) == 0) | (cv::countNonZero(goldCorner2 != winCorner) == 0); + ASSERT_TRUE(eq); + } + // TODO: fix aruco generateImage and add test aruco corners for generated image +} + TEST(Charuco, testSeveralBoardsWithCustomIds) { Size res{500, 500};