Merge branch 4.x

This commit is contained in:
Alexander Alekhin 2024-11-22 02:32:17 +00:00
commit 7808d50412
20 changed files with 1381 additions and 1615 deletions

View File

@ -1,24 +1,7 @@
project(kleidicv_hal)
set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
if(KLEIDICV_SOURCE_PATH)
set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
else()
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
HASH ${KLEIDICV_SRC_HASH}
URL
"${OPENCV_KLEIDICV_URL}"
"$ENV{OPENCV_KLEIDICV_URL}"
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
ID KLEIDICV
STATUS res
UNPACK RELATIVE_URL)
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
if(HAVE_KLEIDICV)
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some Clang versions in NDK
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
endif()
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some Clang versions in NDK
include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")

21
3rdparty/kleidicv/kleidicv.cmake vendored Normal file
View File

@ -0,0 +1,21 @@
# Download and unpack the pinned KleidiCV source archive into the OpenCV
# binary tree.
#
# Arguments:
#   root_var - name of the caller's variable that receives the path to the
#              unpacked KleidiCV source root on success, or "" on failure.
function(download_kleidicv root_var)
  set(${root_var} "" PARENT_SCOPE)
  ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
  ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
  set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
  ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
               HASH ${KLEIDICV_SRC_HASH}
               URL
                 "${OPENCV_KLEIDICV_URL}"
                 "$ENV{OPENCV_KLEIDICV_URL}"
                 "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
               DESTINATION_DIR "${THE_ROOT}"  # quoted: the build dir path may contain spaces
               ID KLEIDICV
               STATUS res
               UNPACK RELATIVE_URL)
  if(res)
    # Reuse THE_ROOT instead of re-spelling the full path, so the download
    # destination and the reported source root cannot drift apart.
    set(${root_var} "${THE_ROOT}/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
  endif()
endfunction()

View File

@ -861,7 +861,7 @@ if(NOT DEFINED OpenCV_HAL)
set(OpenCV_HAL "OpenCV_HAL")
endif()
if(WITH_KLEIDICV)
if(HAVE_KLEIDICV)
ocv_debug_message(STATUS "Enable KleidiCV acceleration")
if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;")
set(OpenCV_HAL "kleidicv;${OpenCV_HAL}")

View File

@ -161,3 +161,19 @@ if(WITH_CLP)
endif()
endif()
endif(WITH_CLP)
# --- ARM KleidiCV
if(WITH_KLEIDICV)
if(KLEIDICV_SOURCE_PATH AND EXISTS "${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
set(HAVE_KLEIDICV ON)
endif()
if(NOT HAVE_KLEIDICV)
include("${OpenCV_SOURCE_DIR}/3rdparty/kleidicv/kleidicv.cmake")
download_kleidicv(KLEIDICV_SOURCE_PATH)
if(KLEIDICV_SOURCE_PATH)
set(HAVE_KLEIDICV ON)
endif()
else()
set(HAVE_KLEIDICV OFF)
endif()
endif(WITH_KLEIDICV)

View File

@ -613,7 +613,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
typedef int (*ScalarFunc)(const uchar* src, size_t step_src,
uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool scalarIsFirst);
void* scalar, bool scalarIsFirst, int nChannels);
typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
@ -887,7 +887,6 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
for( size_t j = 0; j < total; j += blocksize )
{
int bsz = (int)MIN(total - j, blocksize);
Size bszn(bsz*cn, 1);
const uchar *sptr1 = ptrs[0];
const uchar* sptr2 = buf2;
uchar* dptr = ptrs[1];
@ -900,17 +899,17 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
// try to perform operation in 1 call, fallback to classic way if fail
uchar* opconverted = haveMask ? maskbuf : dptr;
if (!scalarFunc || src2.total() != 1 ||
scalarFunc(extSptr1, 1, opconverted, 1, bszn.width, bszn.height, (void*)extSptr2, swapped12) != 0)
scalarFunc(extSptr1, 1, opconverted, 1, bsz, 1, (void*)extSptr2, swapped12, cn) != 0)
{
// try to perform operation with conversion in one call
// if fail, use converter functions
if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
bszn.width, bszn.height, usrdata) != 0)
bsz*cn, 1, usrdata) != 0)
{
if( cvtsrc1 )
{
cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
cvtsrc1( sptr1, 1, 0, 1, buf1, 1, Size(bsz*cn, 1), 0 );
sptr1 = buf1;
}
@ -918,12 +917,12 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
std::swap(sptr1, sptr2);
uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
func( sptr1, 1, sptr2, 1, fdst, 1, bsz*cn, 1, usrdata );
if (cvtdst)
{
uchar* cdst = haveMask ? maskbuf : dptr;
cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
cvtdst(wbuf, 1, 0, 1, cdst, 1, Size(bsz*cn, 1), 0);
}
opconverted = cvtdst ? maskbuf : wbuf;
}
@ -965,9 +964,9 @@ static BinaryFuncC* getAddTab()
}
static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool /*scalarIsFirst*/)
void* scalar, bool /*scalarIsFirst*/, int nChannels)
{
int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar);
int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
return res;
else
@ -978,9 +977,9 @@ static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst,
}
static int addScalar16s16sWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool /*scalarIsFirst*/)
void* scalar, bool /*scalarIsFirst*/, int nChannels)
{
int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar);
int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar, nChannels);
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
return res;
else
@ -1094,6 +1093,67 @@ static BinaryFuncC* getAbsDiffTab()
return absDiffTab;
}
static int absDiffScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool /*scalarIsFirst*/, int nChannels)
{
int res = cv_hal_absDiffScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
return res;
else
{
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
" returned %d (0x%08x)", res, res));
}
}
static int absDiffScalar32s32uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool /*scalarIsFirst*/, int nChannels)
{
int res = cv_hal_absDiffScalar32s32u((const int*)src, step_src, (uint32_t*)dst, step_dst, width, height, (const int*)scalar, nChannels);
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
return res;
else
{
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
" returned %d (0x%08x)", res, res));
}
}
static int absDiffScalar8u8uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
void* scalar, bool /*scalarIsFirst*/, int nChannels)
{
int res = cv_hal_absDiffScalar8u8u((const uchar*)src, step_src, (uchar*)dst, step_dst, width, height, (const uchar*)scalar, nChannels);
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
return res;
else
{
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
" returned %d (0x%08x)", res, res));
}
}
// Select a HAL-backed scalar absdiff wrapper for the given source and
// destination depths. Returns nullptr when no specialized implementation
// exists for the combination (caller falls back to the generic path).
static ScalarFunc getAbsDiffScalarFunc(int srcType, int dstType)
{
    // Specializations only exist for matching source/destination depths.
    if (srcType != dstType)
        return nullptr;
    switch (srcType)
    {
    case CV_32F:
        return absDiffScalar32f32fWrapper;
    case CV_32S:
        // resulting type is 32U in fact
        return absDiffScalar32s32uWrapper;
    case CV_8U:
        return absDiffScalar8u8uWrapper;
    default:
        return nullptr;
    }
}
}
void cv::add( InputArray src1, InputArray src2, OutputArray dst,
@ -1108,7 +1168,17 @@ void cv::add( InputArray src1, InputArray src2, OutputArray dst,
return;
}
ScalarFunc scalarFunc = getAddScalarFunc(src1.depth(), dtype < 0 ? dst.depth() : dtype);
int sdepth = src1.depth();
if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
{
sdepth = src2.depth();
}
if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
{
sdepth = src1.depth();
}
ScalarFunc scalarFunc = getAddScalarFunc(sdepth, dtype < 0 ? dst.depth() : dtype);
arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD, nullptr,
/* scalarFunc */ scalarFunc );
}
@ -1141,7 +1211,18 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
return;
}
arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
int sdepth = src1.depth();
if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
{
sdepth = src2.depth();
}
if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
{
sdepth = src1.depth();
}
ScalarFunc scalarFunc = getAbsDiffScalarFunc(sdepth, dst.depth());
arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF,
/* extendedFunc */ nullptr, scalarFunc);
}
void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)

View File

@ -119,9 +119,10 @@ Add scalar: _dst[i] = src[i] + scalar
@param width width of the images
@param height height of the images
@param scalar_data pointer to scalar value
@param nChannels number of channels per element
*/
inline int hal_ni_addScalar32f32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, const float* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addScalar16s16s(const int16_t *src_data, size_t src_step, int16_t *dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
// Default stub implementations: report NOT_IMPLEMENTED so the caller falls
// back to OpenCV's own code. A real HAL provides replacements via the
// corresponding cv_hal_addScalar* macros — presumably mirroring the
// cv_hal_absDiffScalar* #defines visible below; confirm against the full header.
inline int hal_ni_addScalar32f32f(const float* src_data, size_t src_step, float* dst_data, size_t dst_step, int width, int height, const float* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_addScalar16s16s(const int16_t* src_data, size_t src_step, int16_t* dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
/**
@ -190,6 +191,23 @@ inline int hal_ni_absdiff64u(const uint64 *src1_data, size_t src1_step, const ui
inline int hal_ni_absdiff64s(const int64 *src1_data, size_t src1_step, const int64 *src2_data, size_t src2_step, int64 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16f(const cv_hal_f16 *src1_data, size_t src1_step, const cv_hal_f16 *src2_data, size_t src2_step, cv_hal_f16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absdiff16bf(const cv_hal_bf16 *src1_data, size_t src1_step, const cv_hal_bf16 *src2_data, size_t src2_step, cv_hal_bf16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/*
Absolute difference with scalar: _dst[i] = | src[i] - scalar |_
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scalar_data pointer to scalar value
@param nChannels number of channels per element
*/
// Default stub implementations: report NOT_IMPLEMENTED so the caller falls
// back to OpenCV's own code. The cv_hal_absDiffScalar* macros are bound to
// these stubs by the #defines further below.
inline int hal_ni_absDiffScalar32f32f(const float* src_data, size_t src_step, float* dst_data, size_t dst_step, int width, int height, const float* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absDiffScalar32s32u(const int* src_data, size_t src_step, uint32_t* dst_data, size_t dst_step, int width, int height, const int* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_absDiffScalar8u8u (const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, const uchar* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
/**
@ -279,6 +297,9 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
#define cv_hal_absdiff64f hal_ni_absdiff64f
#define cv_hal_absdiff16f hal_ni_absdiff16f
#define cv_hal_absdiff16bf hal_ni_absdiff16bf
#define cv_hal_absDiffScalar32f32f hal_ni_absDiffScalar32f32f
#define cv_hal_absDiffScalar32s32u hal_ni_absDiffScalar32s32u
#define cv_hal_absDiffScalar8u8u hal_ni_absDiffScalar8u8u
#define cv_hal_and8u hal_ni_and8u
#define cv_hal_or8u hal_ni_or8u
#define cv_hal_xor8u hal_ni_xor8u

View File

@ -8,7 +8,7 @@ ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
ocv_add_dispatched_file("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON NEON_FP16)
ocv_add_dispatched_file_force_all("layers/cpu_kernels/fast_gemm_kernels" AVX AVX2 NEON LASX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

View File

@ -12,28 +12,21 @@
#include "../../precomp.hpp"
#include "convolution.hpp"
#include "conv_winograd_f63.simd.hpp"
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace dnn {
#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
/*Input transform*/
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
/*Output transform*/
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
const cv::dnn::Winofunc func =
conv->useFP16 ? cv::dnn::getWinofunc_F16()
: (conv->useAVX || conv->useAVX2 || conv->useNEON || conv->useRVV || conv->useSIMD128) ? cv::dnn::getWinofunc_F32()
: cv::dnn::Winofunc::empty();
if (!func.isGood())
return 0;
Mat input = _input.getMat();
Mat output = _output.getMat();
Mat fusedAddMat = _fusedAddMat.getMat();
@ -52,42 +45,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
const int CONV_WINO_KBLOCK = 4;
#if (CV_NEON && CV_NEON_AARCH64)
const int CONV_WINO_IBLOCK = 6;
#elif CV_TRY_AVX || CV_TRY_AVX2
const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
#else
const int CONV_WINO_IBLOCK = 3;
#endif
#if CV_TRY_AVX || CV_TRY_AVX2
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
#else
const int CONV_WINO_ATOM_F32 = 4;
#endif
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
int CONV_WINO_ATOM = CONV_WINO_ATOM_F32;
int CONV_WINO_NATOMS = CONV_WINO_NATOMS_F32;
#ifdef CONV_ARM_FP16
// FP 16
const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
#endif
int esz = sizeof(float );
#ifdef CONV_ARM_FP16
const bool useFP16 = conv->useFP16;
if (useFP16)
{
// works at FP 16.
CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
esz = sizeof(__fp16);
}
#endif
const int CONV_WINO_IBLOCK = func.iblock;
const int CONV_WINO_ATOM = func.natom;
const int CONV_WINO_NATOMS = CONV_WINO_AREA / CONV_WINO_ATOM;
const int esz = func.esz;
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
const size_t inp_planesize = (size_t)Hi*Wi;
@ -175,35 +136,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
inptr = inpbuf;
inpstep = CONV_WINO_SIZE;
}
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
if (conv->useNEON)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
opt_NEON_FP16::winofunc_BtXB_8x8_F16(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK,
CONV_WINO_ATOM);
}
else
#endif
opt_NEON::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK,
CONV_WINO_ATOM);
}
else
#endif
winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
func.BtXB_8x8(inptr, inpstep, (uchar*)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
}
else
{
@ -219,18 +152,20 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
// apply inverse Winograd transforms to the sums,
// add bias, apply activation function if any and store the results.
char* wptr0 = nullptr;
#ifdef CONV_ARM_FP16
if (useFP16)
if (esz == 2)
{
CV_Assert(!conv->weightsWinoBuf_FP16.empty());
wptr0 = (char *)conv->getWeightsWinoFP16();
}
else
#endif
else if (esz == 4)
{
CV_Assert(!conv->weightsWinoBuf.empty());
wptr0 = (char *)conv->getWeightsWino();
}
else
{
CV_Error(Error::StsError, "Impossible configuration");
}
parallel_for_(Range(0, ntasks), [&](const Range& r0) {
for (int task_id = r0.start; task_id < r0.end; task_id++)
@ -271,36 +206,9 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
char* inwptr = wbuf_all + inwofs * esz;
char* wptr = wptr0 + wofs * esz;
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
if (conv->useNEON)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
opt_NEON_FP16::winofunc_accum_F16(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
}
else
#endif
opt_NEON::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
}
else
#endif
winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
func.accum((uchar*)inwptr, (uchar*)wptr, (uchar*)out_wbuf, Cg,
block_id1 - block_id0, CONV_WINO_IBLOCK,
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
for (int k = k0; k < k1; k++)
{
@ -336,37 +244,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
dx1*sizeof(pbptr0[0]));
}
}
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif
#if CV_TRY_AVX
if (conv->useAVX)
opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif
#if CV_NEON && CV_NEON_AARCH64
// NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics.
if (conv->useNEON)
{
#ifdef CONV_ARM_FP16
if (useFP16)
{
opt_NEON_FP16::winofunc_AtXA_8x8_F16(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA * esz, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
}
else
#endif
opt_NEON::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
}
else
#endif
winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
const int count = ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA;
func.AtXA_8x8((uchar*)out_wbuf + count * esz, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
if (partial)
{
@ -383,441 +264,4 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
return 1;
}
/****************************************************************************************\
SIMD for winograd function
\****************************************************************************************/
#if CV_SIMD128
// Winograd accumulation stage: multiply-accumulate the transformed input
// tiles (inwptr) with the transformed weights (wptr) over all Cg input
// channels, writing partial sums to outbuf. Specialized for
// winoIblock == 3, winoKblock == 4, winoAtomF32 == 4 (enforced below).
// NOTE(review): the 64-float stride between output slots and the
// CONV_WINO_AREA constant in the reference branch are assumed to agree —
// confirm against the enclosing file's CONV_WINO_* definitions.
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
#if 1
CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4);
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
// 4 (winoKblock) x 3 (winoIblock) accumulators of 4 floats each.
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00;
v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00;
v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32)
{
// Load 3 input atoms once, reuse them against 4 weight atoms.
v_float32x4 x0, x1, x2;
x0 = v_load(inwptr);
x1 = v_load(inwptr + 4);
x2 = v_load(inwptr + 8);
v_float32x4 w0 = v_load(wptr);
s00 = v_fma(w0, x0, s00);
s01 = v_fma(w0, x1, s01);
s02 = v_fma(w0, x2, s02);
w0 = v_load(wptr + 4);
s10 = v_fma(w0, x0, s10);
s11 = v_fma(w0, x1, s11);
s12 = v_fma(w0, x2, s12);
w0 = v_load(wptr + 8);
s20 = v_fma(w0, x0, s20);
s21 = v_fma(w0, x1, s21);
s22 = v_fma(w0, x2, s22);
w0 = v_load(wptr + 12);
s30 = v_fma(w0, x0, s30);
s31 = v_fma(w0, x1, s31);
s32 = v_fma(w0, x2, s32);
}
// Scatter the 12 accumulators with a stride of 64 floats per slot.
v_store(outbuf, s00);
v_store(outbuf + 1*64, s01);
v_store(outbuf + 2*64, s02);
v_store(outbuf + 3*64, s10);
v_store(outbuf + 4*64, s11);
v_store(outbuf + 5*64, s12);
v_store(outbuf + 6*64, s20);
v_store(outbuf + 7*64, s21);
v_store(outbuf + 8*64, s22);
v_store(outbuf + 9*64, s30);
v_store(outbuf + 10*64, s31);
v_store(outbuf + 11*64, s32);
}
#else
// Naive C++ reference implementation; compiled out, kept for documentation
// and debugging of the SIMD branch above.
for (int atom_id = 0; atom_id < winoNatomF32;
atom_id++, outbuf += winoAtomF32)
{
float sumbuf[winoIblock*winoKblock*winoAtomF32];
memset(sumbuf, 0, sizeof(sumbuf));
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32)
{
for (int i = 0; i < winoKblock; i++)
{
for (int j = 0; j < winoIblock; j++)
{
int i_ = i*winoAtomF32;
int j_ = j*winoAtomF32;
int ij_ = i_*winoIblock + j_;
float s0 = inwptr[j_ + 0]*wptr[i_ + 0];
float s1 = inwptr[j_ + 1]*wptr[i_ + 1];
float s2 = inwptr[j_ + 2]*wptr[i_ + 2];
float s3 = inwptr[j_ + 3]*wptr[i_ + 3];
sumbuf[ij_ + 0] += s0;
sumbuf[ij_ + 1] += s1;
sumbuf[ij_ + 2] += s2;
sumbuf[ij_ + 3] += s3;
}
}
}
for (int ij = 0; ij < winoKblock*winoIblock; ij++)
{
int ij_ = ij*winoAtomF32;
int ij_out = ij*CONV_WINO_AREA;
outbuf[ij_out + 0] = sumbuf[ij_ + 0];
outbuf[ij_out + 1] = sumbuf[ij_ + 1];
outbuf[ij_out + 2] = sumbuf[ij_ + 2];
outbuf[ij_out + 3] = sumbuf[ij_ + 3];
}
}
#endif
}
/*Input transform*/
/* Input transform: computes B' * X * B for one 8x8 input tile.
   The row transform (producing Y) and the column transform (producing Z)
   apply the same coefficient pattern; the row comments below give each
   transform row as a dot product with the 8 input rows/columns.
   Each 8-wide row is held in a pair of v_float32x4 registers (suffixes 0/1).
   Specialized for winoIblock == 3, winoAtomF32 == 4 (asserted). */
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
// Load the 8x8 tile: xR0 holds columns 0..3 of row R, xR1 holds columns 4..7.
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
{
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
t00 = v_sub(x40, x20);
t01 = v_sub(x41, x21);
t10 = v_sub(x30, x50);
t11 = v_sub(x31, x51);
v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
v_float32x4 qm4_25 = v_setall_f32(-4.25f);
t00 = v_fma(x30, qm4_25, v_add(x10, x50));
t01 = v_fma(x31, qm4_25, v_add(x11, x51));
t10 = v_fma(x40, qm4_25, v_add(x20, x60));
t11 = v_fma(x41, qm4_25, v_add(x21, x61));
v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
t00 = v_fma(x10, q0_5, v_add(x50, x50));
t01 = v_fma(x11, q0_5, v_add(x51, x51));
t10 = v_fma(x20, q0_25, x60);
t11 = v_fma(x21, q0_25, x61);
t00 = v_fma(x30, qm2_5, t00);
t01 = v_fma(x31, qm2_5, t01);
t10 = v_fma(x40, qm1_25, t10);
t11 = v_fma(x41, qm1_25, t11);
v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
t00 = v_fma(x50, q0_5, v_add(x10, x10));
t01 = v_fma(x51, q0_5, v_add(x11, x11));
t10 = v_fma(x20, q4 , x60);
t11 = v_fma(x21, q4 , x61);
t00 = v_fma(x30, qm2_5, t00);
t01 = v_fma(x31, qm2_5, t01);
t10 = v_fma(x40, qm5 , t10);
t11 = v_fma(x41, qm5 , t11);
v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
/* transpose 8x8 matrix with v_transpose4x4 */
// After the transpose, the same coefficient pattern applied to the y*
// registers implements the column-wise transform (B' applied on the right).
v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = v_sub(y010, y200);
t01 = v_sub(y410, y600);
t10 = v_sub(y300, y110);
t11 = v_sub(y700, y510);
z00 = v_fma(t00, q5_25, v_sub(y000, y210));
z01 = v_fma(t01, q5_25, v_sub(y400, y610));
z70 = v_fma(t10, q5_25, v_sub(y310, y100));
z71 = v_fma(t11, q5_25, v_sub(y710, y500));
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = v_fma(y300, qm4_25, v_add(y100, y110));
t01 = v_fma(y700, qm4_25, v_add(y500, y510));
t10 = v_fma(y010, qm4_25, v_add(y200, y210));
t11 = v_fma(y410, qm4_25, v_add(y600, y610));
z10 = v_add(t00, t10); z11 = v_add(t01, t11);
z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = v_fma(y100, q0_5, v_add(y110, y110));
t01 = v_fma(y500, q0_5, v_add(y510, y510));
t10 = v_fma(y200, q0_25, y210);
t11 = v_fma(y600, q0_25, y610);
t00 = v_fma(y300, qm2_5, t00);
t01 = v_fma(y700, qm2_5, t01);
t10 = v_fma(y010, qm1_25, t10);
t11 = v_fma(y410, qm1_25, t11);
z30 = v_add(t00, t10); z31 = v_add(t01, t11);
z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = v_fma(y110, q0_5, v_add(y100, y100));
t01 = v_fma(y510, q0_5, v_add(y500, y500));
t10 = v_fma(y200, q4, y210);
t11 = v_fma(y600, q4, y610);
t00 = v_fma(y300, qm2_5, t00);
t01 = v_fma(y700, qm2_5, t01);
t10 = v_fma(y010, qm5, t10);
t11 = v_fma(y410, qm5, t11);
z50 = v_add(t00, t10); z51 = v_add(t01, t11);
z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
}
// Store the transformed tile; consecutive slots are outstep floats apart
// so atoms from different channels interleave as the accumulator expects.
const int outstep = winoIblock*winoAtomF32*Cg;
v_store(outptr, z00);
v_store(outptr + outstep, z01);
v_store(outptr + outstep*2, z10);
v_store(outptr + outstep*3, z11);
v_store(outptr + outstep*4, z20);
v_store(outptr + outstep*5, z21);
v_store(outptr + outstep*6, z30);
v_store(outptr + outstep*7, z31);
v_store(outptr + outstep*8, z40);
v_store(outptr + outstep*9, z41);
v_store(outptr + outstep*10, z50);
v_store(outptr + outstep*11, z51);
v_store(outptr + outstep*12, z60);
v_store(outptr + outstep*13, z61);
v_store(outptr + outstep*14, z70);
v_store(outptr + outstep*15, z71);
}
/*Output transform*/
/* Inverse Winograd 8x8 transform:
out = (A'*inp*A)', where
inp is input 8x8 FP32 matrix,
A' is
[1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
inp is pre-loaded into xij registers,
out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1.
After the inverse transform is done, we add bias,
optionally add results from the earlier tensors (by-pass),
optionally apply activation function and then
store the final results.
That is, after both forward and then inverse transformation,
we get non-transposed result.
Of course, for the correct work of Winograd-based convolution,
the Winograd-transformed weights should also be transposed.
init_conv() (see OpConv.fx) takes care of that.
*/
// Inverse Winograd output transform (universal-intrinsics version):
// computes out = (A'*inp*A)' for one 8x8 FP32 tile (A' is shown in the
// comment block above), producing a 6x6 result, then adds `bias`,
// optionally adds the by-pass tile `bpptr` and clips to [minval, maxval].
//   inptr/inpstep   : 8x8 input tile and its row stride, in floats
//   bpptr/bpstep    : optional by-pass (residual) tile, may be NULL
//   outptr/outstep  : 6x6 output tile and its row stride, in floats
//   bias            : scalar added to every output element
//   minval/maxval   : clip range, applied only when ifMinMaxAct is true
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
// Each 8-float row is held as a pair of 4-lane registers (xi0, xi1).
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
{
/* First (vertical) pass: y = A'*inp, combining input rows with the
   coefficients of the A' matrix documented above. s12/s34/s56 hold the
   pairwise sums (then differences) of rows 1&2, 3&4, 5&6. */
v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) );
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f);
v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
/* rows 6 and 7 of Y are zero: only 6 of the 8 rows carry data here */
v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
/* transpose 8x8 matrix with v_transpose4x4 */
v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
/* Second (horizontal) pass: z = Y*A, same coefficient pattern applied
   to the transposed data. */
s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f);
z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
/* add bias to all 6x6 result elements */
v_float32x4 vbias = v_setall_f32(bias);
z00 = v_add(z00, vbias);
z01 = v_add(z01, vbias);
z10 = v_add(z10, vbias);
z11 = v_add(z11, vbias);
z20 = v_add(z20, vbias);
z21 = v_add(z21, vbias);
z30 = v_add(z30, vbias);
z31 = v_add(z31, vbias);
z40 = v_add(z40, vbias);
z41 = v_add(z41, vbias);
z50 = v_add(z50, vbias);
z51 = v_add(z51, vbias);
}
/* optional by-pass add: each output row is 6 floats wide, so the second
   register only uses its low half (v_load_low reads 2 floats) */
if (bpptr)
{
z00 = v_add(z00, v_load(bpptr));
z01 = v_add(z01, v_load_low(bpptr + 4));
z10 = v_add(z10, v_load(bpptr + bpstep));
z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
z20 = v_add(z20, v_load(bpptr + bpstep * 2));
z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
z30 = v_add(z30, v_load(bpptr + bpstep * 3));
z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
z40 = v_add(z40, v_load(bpptr + bpstep * 4));
z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
z50 = v_add(z50, v_load(bpptr + bpstep * 5));
z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
}
/* optional clamp to [minval, maxval] (clipped activation) */
if (ifMinMaxAct)
{
v_float32x4 vmax = v_setall_f32(maxval);
v_float32x4 vmin = v_setall_f32(minval);
z00 = v_min(v_max(z00, vmin), vmax);
z01 = v_min(v_max(z01, vmin), vmax);
z10 = v_min(v_max(z10, vmin), vmax);
z11 = v_min(v_max(z11, vmin), vmax);
z20 = v_min(v_max(z20, vmin), vmax);
z21 = v_min(v_max(z21, vmin), vmax);
z30 = v_min(v_max(z30, vmin), vmax);
z31 = v_min(v_max(z31, vmin), vmax);
z40 = v_min(v_max(z40, vmin), vmax);
z41 = v_min(v_max(z41, vmin), vmax);
z50 = v_min(v_max(z50, vmin), vmax);
z51 = v_min(v_max(z51, vmin), vmax);
}
/* store 6 rows of 6 floats: full 4-lane store + low-half store per row */
v_store(outptr, z00);
v_store_low(outptr + 4, z01);
v_store(outptr + outstep, z10);
v_store_low(outptr + outstep + 4, z11);
v_store(outptr + outstep*2, z20);
v_store_low(outptr + outstep*2 + 4, z21);
v_store(outptr + outstep*3, z30);
v_store_low(outptr + outstep*3 + 4, z31);
v_store(outptr + outstep*4, z40);
v_store_low(outptr + outstep*4 + 4, z41);
v_store(outptr + outstep*5, z50);
v_store_low(outptr + outstep*5 + 4, z51);
}
#endif
#else
// Stub compiled when the Winograd SIMD kernels are unavailable (see the
// surrounding #if/#else). Returns 0, i.e. "not handled" — presumably the
// caller then falls back to a non-Winograd convolution path (TODO confirm
// against the caller in the dispatching code).
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
return 0;
}
#endif
}} // namespace cv::dnn

View File

@ -0,0 +1,22 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "convolution.hpp"
#include "conv_winograd_f63.simd.hpp"
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp"
namespace cv {
namespace dnn {
// Returns the FP32 Winograd kernel table for the best CPU feature set
// available at runtime; CV_CPU_DISPATCH expands to the per-ISA call
// (including the return statement), so no explicit return is needed here.
cv::dnn::Winofunc getWinofunc_F32()
{
CV_CPU_DISPATCH(getWinofunc_F32, (), CV_CPU_DISPATCH_MODES_ALL);
}
// Same as getWinofunc_F32() but for the FP16 Winograd kernels: runtime
// CPU dispatch via CV_CPU_DISPATCH (which emits the return itself).
cv::dnn::Winofunc getWinofunc_F16()
{
CV_CPU_DISPATCH(getWinofunc_F16, (), CV_CPU_DISPATCH_MODES_ALL);
}
}} // namespace cv::dnn::

View File

@ -1,476 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../../precomp.hpp"
#include "convolution.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
// NEON code work around.
namespace opt_NEON
{
#if CV_NEON && CV_NEON_AARCH64
/* Accumulate */
// Winograd element-wise accumulation (AArch64 NEON): for one atom slice,
// multiply-accumulate transformed input (inwptr) with transformed weights
// (wptr) over Cg input channels, for winoKblock=4 output-channel groups and
// up to winoIblock=6 input blocks. Results are written to outbuf with a
// fixed stride of 64 floats between (k, i) slots — presumably the 8x8
// Winograd tile area (TODO confirm against the caller's buffer layout).
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
// Main path: more than 3 input blocks -> full 4 (K) x 6 (I) register tile.
if (iblock > 3)
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
// skl = accumulator for weight group k, input block l (4 lanes each)
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
// accumulate over input channels; pointers advance by one atom row
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32) {
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
float32x4_t x0, x1;
x0 = vld1q_f32(inwptr);
x1 = vld1q_f32(inwptr + 4);
s00 = vfmaq_f32(s00, w0, x0);
s01 = vfmaq_f32(s01, w0, x1);
s10 = vfmaq_f32(s10, w1, x0);
s11 = vfmaq_f32(s11, w1, x1);
s20 = vfmaq_f32(s20, w2, x0);
s21 = vfmaq_f32(s21, w2, x1);
s30 = vfmaq_f32(s30, w3, x0);
s31 = vfmaq_f32(s31, w3, x1);
x0 = vld1q_f32(inwptr + 8);
x1 = vld1q_f32(inwptr + 12);
s02 = vfmaq_f32(s02, w0, x0);
s03 = vfmaq_f32(s03, w0, x1);
s12 = vfmaq_f32(s12, w1, x0);
s13 = vfmaq_f32(s13, w1, x1);
s22 = vfmaq_f32(s22, w2, x0);
s23 = vfmaq_f32(s23, w2, x1);
s32 = vfmaq_f32(s32, w3, x0);
s33 = vfmaq_f32(s33, w3, x1);
x0 = vld1q_f32(inwptr + 16);
x1 = vld1q_f32(inwptr + 20);
s04 = vfmaq_f32(s04, w0, x0);
s05 = vfmaq_f32(s05, w0, x1);
s14 = vfmaq_f32(s14, w1, x0);
s15 = vfmaq_f32(s15, w1, x1);
s24 = vfmaq_f32(s24, w2, x0);
s25 = vfmaq_f32(s25, w2, x1);
s34 = vfmaq_f32(s34, w3, x0);
s35 = vfmaq_f32(s35, w3, x1);
}
// write the 4x6 tile; slot (k, l) lives at outbuf + (k*6 + l)*64
vst1q_f32(outbuf, s00);
vst1q_f32(outbuf + 1*64, s01);
vst1q_f32(outbuf + 2*64, s02);
vst1q_f32(outbuf + 3*64, s03);
vst1q_f32(outbuf + 4*64, s04);
vst1q_f32(outbuf + 5*64, s05);
vst1q_f32(outbuf + 6*64, s10);
vst1q_f32(outbuf + 7*64, s11);
vst1q_f32(outbuf + 8*64, s12);
vst1q_f32(outbuf + 9*64, s13);
vst1q_f32(outbuf + 10*64, s14);
vst1q_f32(outbuf + 11*64, s15);
vst1q_f32(outbuf + 12*64, s20);
vst1q_f32(outbuf + 13*64, s21);
vst1q_f32(outbuf + 14*64, s22);
vst1q_f32(outbuf + 15*64, s23);
vst1q_f32(outbuf + 16*64, s24);
vst1q_f32(outbuf + 17*64, s25);
vst1q_f32(outbuf + 18*64, s30);
vst1q_f32(outbuf + 19*64, s31);
vst1q_f32(outbuf + 20*64, s32);
vst1q_f32(outbuf + 21*64, s33);
vst1q_f32(outbuf + 22*64, s34);
vst1q_f32(outbuf + 23*64, s35);
}
}
// Tail path: <= 3 input blocks -> only 3 accumulators per weight group;
// output slots for blocks 3..5 are left untouched.
else
{
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
outbuf += winoAtomF32)
{
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
wptr += winoKblock*winoAtomF32) {
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
float32x4_t x0, x1, x2;
x0 = vld1q_f32(inwptr);
x1 = vld1q_f32(inwptr + 4);
x2 = vld1q_f32(inwptr + 8);
s00 = vfmaq_f32(s00, w0, x0);
s01 = vfmaq_f32(s01, w0, x1);
s02 = vfmaq_f32(s02, w0, x2);
s10 = vfmaq_f32(s10, w1, x0);
s11 = vfmaq_f32(s11, w1, x1);
s12 = vfmaq_f32(s12, w1, x2);
s20 = vfmaq_f32(s20, w2, x0);
s21 = vfmaq_f32(s21, w2, x1);
s22 = vfmaq_f32(s22, w2, x2);
s30 = vfmaq_f32(s30, w3, x0);
s31 = vfmaq_f32(s31, w3, x1);
s32 = vfmaq_f32(s32, w3, x2);
}
vst1q_f32(outbuf, s00);
vst1q_f32(outbuf + 1*64, s01);
vst1q_f32(outbuf + 2*64, s02);
vst1q_f32(outbuf + 6*64, s10);
vst1q_f32(outbuf + 7*64, s11);
vst1q_f32(outbuf + 8*64, s12);
vst1q_f32(outbuf + 12*64, s20);
vst1q_f32(outbuf + 13*64, s21);
vst1q_f32(outbuf + 14*64, s22);
vst1q_f32(outbuf + 18*64, s30);
vst1q_f32(outbuf + 19*64, s31);
vst1q_f32(outbuf + 20*64, s32);
}
}
}
/* T4x4(a, b, c, d, tr0, tr1): transpose in place the 4x4 FP32 matrix whose
   rows are the NEON registers a..d; tr0/tr1 are float32x4x2_t scratch
   variables. Implemented as two 2x2 vtrnq_f32 interleaves followed by
   recombining low/high halves. (No comments inside the macro body — a
   line comment would swallow the trailing backslash.) */
#undef T4x4
#define T4x4(a, b, c, d, tr0, tr1) \
tr0 = vtrnq_f32(a, b); \
tr1 = vtrnq_f32(c, d); \
a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
/*Input transform*/
// Forward Winograd input transform (AArch64 NEON): computes B'*X*B for one
// 8x8 FP32 tile. The B' row coefficients are spelled out in the Y[i]/Z[i]
// comments below. The 16 half-row results are scattered into the packed
// input-workspace with stride outstep = winoIblock*winoAtomF32*Cg.
//   inptr/inpstep : 8x8 input tile and its row stride, in floats
//   outptr        : destination in the packed Winograd input buffer
//   Cg            : channels per group (only affects the output stride)
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
// Each 8-float row is held as two 4-lane registers (xi0, xi1).
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
{
/* First (vertical) pass: Y = B'*X, one Y row per pair of comments. */
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
t00 = vsubq_f32(x40, x20);
t01 = vsubq_f32(x41, x21);
t10 = vsubq_f32(x30, x50);
t11 = vsubq_f32(x31, x51);
float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);
float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
t10 = vfmaq_f32(x60, x20, q0_25);
t11 = vfmaq_f32(x61, x21, q0_25);
t00 = vfmaq_f32(t00, x30, qm2_5);
t01 = vfmaq_f32(t01, x31, qm2_5);
t10 = vfmaq_f32(t10, x40, qm1_25);
t11 = vfmaq_f32(t11, x41, qm1_25);
float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
t10 = vfmaq_f32(x60, x20, q4);
t11 = vfmaq_f32(x61, x21, q4);
t00 = vfmaq_f32(t00, x30, qm2_5);
t01 = vfmaq_f32(t01, x31, qm2_5);
t10 = vfmaq_f32(t10, x40, qm5);
t11 = vfmaq_f32(t11, x41, qm5);
float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y70 y71 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
float32x4x2_t tr0, tr1;
T4x4(y00, y10, y20, y30, tr0, tr1);
T4x4(y01, y11, y21, y31, tr0, tr1);
T4x4(y40, y50, y60, y70, tr0, tr1);
T4x4(y41, y51, y61, y71, tr0, tr1);
/* Second (horizontal) pass: Z = Y*B, using the renumbered registers —
   note y01/y11/y21/y31 now hold what were columns 4..7 (see above). */
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
t00 = vsubq_f32(y01, y20);
t01 = vsubq_f32(y41, y60);
t10 = vsubq_f32(y30, y11);
t11 = vsubq_f32(y70, y51);
z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);
z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
t10 = vfmaq_f32(y21, y20, q0_25);
t11 = vfmaq_f32(y61, y60, q0_25);
t00 = vfmaq_f32(t00, y30, qm2_5);
t01 = vfmaq_f32(t01, y70, qm2_5);
t10 = vfmaq_f32(t10, y01, qm1_25);
t11 = vfmaq_f32(t11, y41, qm1_25);
z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
t10 = vfmaq_f32(y21, y20, q4);
t11 = vfmaq_f32(y61, y60, q4);
t00 = vfmaq_f32(t00, y30, qm2_5);
t01 = vfmaq_f32(t01, y70, qm2_5);
t10 = vfmaq_f32(t10, y01, qm5);
t11 = vfmaq_f32(t11, y41, qm5);
z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
}
/* scatter all 16 half-rows into the packed per-channel layout */
const int outstep = winoIblock*winoAtomF32*Cg;
vst1q_f32(outptr, z00);
vst1q_f32(outptr + outstep, z01);
vst1q_f32(outptr + outstep*2, z10);
vst1q_f32(outptr + outstep*3, z11);
vst1q_f32(outptr + outstep*4, z20);
vst1q_f32(outptr + outstep*5, z21);
vst1q_f32(outptr + outstep*6, z30);
vst1q_f32(outptr + outstep*7, z31);
vst1q_f32(outptr + outstep*8, z40);
vst1q_f32(outptr + outstep*9, z41);
vst1q_f32(outptr + outstep*10, z50);
vst1q_f32(outptr + outstep*11, z51);
vst1q_f32(outptr + outstep*12, z60);
vst1q_f32(outptr + outstep*13, z61);
vst1q_f32(outptr + outstep*14, z70);
vst1q_f32(outptr + outstep*15, z71);
}
/*Output transform*/
// Inverse Winograd output transform (AArch64 NEON): out = (A'*inp*A)' for
// one 8x8 FP32 tile, yielding a 6x6 result; then adds `bias`, optionally
// adds the by-pass tile `bpptr` and clamps to [minval, maxval].
//   inptr/inpstep   : 8x8 input tile and its row stride, in floats
//   bpptr/bpstep    : optional by-pass (residual) tile, may be NULL
//   outptr/outstep  : 6x6 output tile and its row stride, in floats
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
{
/* First (vertical) pass: y = A'*inp. s12/s34/s56 hold pairwise sums
   (then differences) of rows 1&2, 3&4, 5&6. */
float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);
float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
s34_0, 32.f), s56_0, 1.f/32);
float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
s34_1, 32.f), s56_1, 1.f/32);
float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
/* rows 6 and 7 of Y are zero: the result has only 6 meaningful rows */
float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
/* Y: */
/* y00 y01 */
/* y10 y11 */
/* ... */
/* y50 y51 */
/* 0 0 */
/* 0 0 */
/* Y': */
/* y00 y40 */
/* y10 y50 */
/* y20 y60 */
/* y30 y70 */
/* y01 y41 */
/* y11 y51 */
/* y21 y61 */
/* y31 y71 */
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
float32x4x2_t tr0, tr1;
T4x4(y00, y10, y20, y30, tr0, tr1);
T4x4(y01, y11, y21, y31, tr0, tr1);
T4x4(y40, y50, y60, y70, tr0, tr1);
T4x4(y41, y51, y61, y71, tr0, tr1);
/* Second (horizontal) pass: z = Y*A on the renumbered registers. */
s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);
z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
s34_0, 32.f), s56_0, 1.f/32);
z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
s34_1, 32.f), s56_1, 1.f/32);
z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
/* add bias to every output element */
float32x4_t vbias = vdupq_n_f32(bias);
z00 = vaddq_f32(z00, vbias);
z01 = vaddq_f32(z01, vbias);
z10 = vaddq_f32(z10, vbias);
z11 = vaddq_f32(z11, vbias);
z20 = vaddq_f32(z20, vbias);
z21 = vaddq_f32(z21, vbias);
z30 = vaddq_f32(z30, vbias);
z31 = vaddq_f32(z31, vbias);
z40 = vaddq_f32(z40, vbias);
z41 = vaddq_f32(z41, vbias);
z50 = vaddq_f32(z50, vbias);
z51 = vaddq_f32(z51, vbias);
}
/* optional by-pass add: each row is 6 floats, so the second register only
   carries data in its low half (vld1_f32 loads 2 floats; high half zero) */
if (bpptr)
{
float32x2_t zhalf = vdup_n_f32(0.f);
z00 = vaddq_f32(z00, vld1q_f32(bpptr));
z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
}
/* optional clamp to [minval, maxval] (clipped activation) */
if (ifMinMaxAct)
{
float32x4_t vmax = vdupq_n_f32(maxval);
float32x4_t vmin = vdupq_n_f32(minval);
z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
}
/* store 6 rows of 6 floats: full 4-lane store + 2-lane low-half store */
vst1q_f32(outptr, z00);
vst1_f32(outptr + 4, vget_low_f32(z01));
vst1q_f32(outptr + outstep, z10);
vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
vst1q_f32(outptr + outstep*2, z20);
vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
vst1q_f32(outptr + outstep*3, z30);
vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
vst1q_f32(outptr + outstep*4, z40);
vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
vst1q_f32(outptr + outstep*5, z50);
vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
}
#endif
}
}} // namespace

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@
#define OPENCV_FAST_CONVOLUTION_HPP
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/dnn/all_layers.hpp"
#ifndef CONV_PRAM
#define CONV_PRAM
@ -119,25 +120,30 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo
void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR);
#if CV_NEON_AARCH64
/* Accumulate */
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtom, const int winoNatom);
/*Input transform*/
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtom);
/*Output transform*/
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
#endif // CV_NEON_AARCH64
#endif // CV_NEON
} // namespace opt_NEON.
// === Function tables
// Table of Winograd kernels for one element type (FP32 or FP16); filled by
// getWinofunc_F32()/getWinofunc_F16(). Buffers are passed as uchar* so the
// same signatures cover both element types.
struct Winofunc
{
// multiply-accumulate of transformed input x transformed weights
void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
// forward input transform of one 8x8 tile (B'*X*B)
void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg, const int winoIblock, const int winoAtomF32);
// inverse output transform of one tile ((A'*inp*A)') + bias/by-pass/clip
void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct);
int iblock;   // input blocks processed per accum() call
int natom;    // number of SIMD atoms per Winograd tile
int esz;      // element size in bytes (4 for FP32, 2 for FP16)
// a table is usable only if all kernels and all sizes are set
bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8 && iblock > 0 && natom > 0 && esz > 0; }
// sentinel value meaning "no kernels available" (isGood() == false)
static Winofunc empty() { return {0, 0, 0, 0, 0, 0}; }
};
// === wrapper calls (implemented in .dispatch.cpp)
Winofunc getWinofunc_F32();
Winofunc getWinofunc_F16();
} // namespace dnn
} // namespace cv

View File

@ -1,235 +0,0 @@
/***********************************************************************
* Software License Agreement (BSD License)
*
* Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
* Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/
#ifndef OPENCV_FLANN_HDF5_H_
#define OPENCV_FLANN_HDF5_H_
//! @cond IGNORED
#include <hdf5.h>
#include "matrix.h"
namespace cvflann
{
namespace
{
// Maps a C++ element type to the matching native HDF5 datatype id.
// The unspecialized template rejects unsupported element types at runtime
// (throws FLANNException); the specializations below enumerate every
// supported type.
template<typename T>
hid_t get_hdf5_type()
{
throw FLANNException("Unsupported type for IO operations");
}
template<>
hid_t get_hdf5_type<char>() { return H5T_NATIVE_CHAR; }
template<>
hid_t get_hdf5_type<unsigned char>() { return H5T_NATIVE_UCHAR; }
template<>
hid_t get_hdf5_type<short int>() { return H5T_NATIVE_SHORT; }
template<>
hid_t get_hdf5_type<unsigned short int>() { return H5T_NATIVE_USHORT; }
template<>
hid_t get_hdf5_type<int>() { return H5T_NATIVE_INT; }
template<>
hid_t get_hdf5_type<unsigned int>() { return H5T_NATIVE_UINT; }
template<>
hid_t get_hdf5_type<long>() { return H5T_NATIVE_LONG; }
template<>
hid_t get_hdf5_type<unsigned long>() { return H5T_NATIVE_ULONG; }
template<>
hid_t get_hdf5_type<float>() { return H5T_NATIVE_FLOAT; }
template<>
hid_t get_hdf5_type<double>() { return H5T_NATIVE_DOUBLE; }
}
// Throws FLANNException with message y when the HDF5 status/id x is negative.
#define CHECK_ERROR(x,y) if ((x)<0) throw FLANNException((y));
// Saves `dataset` (rows x cols of T) to HDF5 file `filename` under the
// dataset name `name`. Opens the file read-write if it exists, otherwise
// creates it; creates the named dataset, or opens it if creation fails
// (i.e. it already exists), then writes the whole matrix.
// NOTE(review): if CHECK_ERROR or H5Dwrite throws mid-function, the already
// opened HDF5 handles (file_id, space_id, ...) are not closed — handle leak
// on the error path; consider RAII wrappers or a try/catch cleanup.
template<typename T>
void save_to_file(const cvflann::Matrix<T>& dataset, const String& filename, const String& name)
{
// silence HDF5's automatic error printing; errors surface via exceptions
#if H5Eset_auto_vers == 2
H5Eset_auto( H5E_DEFAULT, NULL, NULL );
#else
H5Eset_auto( NULL, NULL );
#endif
herr_t status;
hid_t file_id;
// try existing file first; fall back to creating a new one
file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
if (file_id < 0) {
file_id = H5Fcreate(filename.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
}
CHECK_ERROR(file_id,"Error creating hdf5 file.");
hsize_t dimsf[2]; // dataset dimensions
dimsf[0] = dataset.rows;
dimsf[1] = dataset.cols;
hid_t space_id = H5Screate_simple(2, dimsf, NULL);
hid_t memspace_id = H5Screate_simple(2, dimsf, NULL);
hid_t dataset_id;
// create the dataset; the API signature differs between HDF5 1.6 and 1.8+
#if H5Dcreate_vers == 2
dataset_id = H5Dcreate2(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
#else
dataset_id = H5Dcreate(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT);
#endif
// creation fails if the dataset already exists -> open and overwrite it
if (dataset_id<0) {
#if H5Dopen_vers == 2
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
#else
dataset_id = H5Dopen(file_id, name.c_str());
#endif
}
CHECK_ERROR(dataset_id,"Error creating or opening dataset in file.");
status = H5Dwrite(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, H5P_DEFAULT, dataset.data );
CHECK_ERROR(status, "Error writing to dataset");
H5Sclose(memspace_id);
H5Sclose(space_id);
H5Dclose(dataset_id);
H5Fclose(file_id);
}
// Loads the HDF5 dataset `name` from file `filename` into `dataset`.
// Allocates a fresh rows*cols buffer with new[]; the caller owns it (the
// previous contents of `dataset` are overwritten without being freed).
// NOTE(review): as in save_to_file, a throw from CHECK_ERROR leaks the
// already opened HDF5 handles.
template<typename T>
void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
{
herr_t status;
hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
CHECK_ERROR(file_id,"Error opening hdf5 file.");
hid_t dataset_id;
#if H5Dopen_vers == 2
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
#else
dataset_id = H5Dopen(file_id, name.c_str());
#endif
CHECK_ERROR(dataset_id,"Error opening dataset in file.");
// query the on-disk extents to size the destination buffer
hid_t space_id = H5Dget_space(dataset_id);
hsize_t dims_out[2];
H5Sget_simple_extent_dims(space_id, dims_out, NULL);
dataset = cvflann::Matrix<T>(new T[dims_out[0]*dims_out[1]], dims_out[0], dims_out[1]);
status = H5Dread(dataset_id, get_hdf5_type<T>(), H5S_ALL, H5S_ALL, H5P_DEFAULT, dataset[0]);
CHECK_ERROR(status, "Error reading dataset");
H5Sclose(space_id);
H5Dclose(dataset_id);
H5Fclose(file_id);
}
#ifdef HAVE_MPI
namespace mpi
{
/**
 * Loads the hyperslice corresponding to this processor from a hdf5 file
 * using collective MPI-IO: rank r reads ceil(rows/size) consecutive rows
 * starting at r*ceil(rows/size); the last rank takes the remainder.
 * @param dataset Dataset where the data is loaded (rows/cols/data are
 *                overwritten; the buffer is allocated with new[] and owned
 *                by the caller)
 * @param filename HDF5 file name
 * @param name Name of dataset inside file
 */
template<typename T>
void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
{
MPI_Comm comm = MPI_COMM_WORLD;
MPI_Info info = MPI_INFO_NULL;
int mpi_size, mpi_rank;
MPI_Comm_size(comm, &mpi_size);
MPI_Comm_rank(comm, &mpi_rank);
herr_t status;
// open the file with the MPI-IO virtual file driver
hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(plist_id, comm, info);
hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, plist_id);
CHECK_ERROR(file_id,"Error opening hdf5 file.");
H5Pclose(plist_id);
hid_t dataset_id;
#if H5Dopen_vers == 2
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
#else
dataset_id = H5Dopen(file_id, name.c_str());
#endif
CHECK_ERROR(dataset_id,"Error opening dataset in file.");
hid_t space_id = H5Dget_space(dataset_id);
hsize_t dims[2];
H5Sget_simple_extent_dims(space_id, dims, NULL);
// compute this rank's row slice: item_cnt rows each, remainder on the last
hsize_t count[2];
hsize_t offset[2];
hsize_t item_cnt = dims[0]/mpi_size+(dims[0]%mpi_size==0 ? 0 : 1);
hsize_t cnt = (mpi_rank<mpi_size-1 ? item_cnt : dims[0]-item_cnt*(mpi_size-1));
count[0] = cnt;
count[1] = dims[1];
offset[0] = mpi_rank*item_cnt;
offset[1] = 0;
hid_t memspace_id = H5Screate_simple(2,count,NULL);
// select this rank's hyperslab in the file dataspace
H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL);
dataset.rows = count[0];
dataset.cols = count[1];
dataset.data = new T[dataset.rows*dataset.cols];
// collective read: all ranks participate in one I/O operation
plist_id = H5Pcreate(H5P_DATASET_XFER);
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
status = H5Dread(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, plist_id, dataset.data);
CHECK_ERROR(status, "Error reading dataset");
H5Pclose(plist_id);
H5Sclose(space_id);
H5Sclose(memspace_id);
H5Dclose(dataset_id);
H5Fclose(file_id);
}
}
#endif // HAVE_MPI
} // namespace cvflann::mpi
//! @endcond
#endif /* OPENCV_FLANN_HDF5_H_ */

View File

@ -190,7 +190,7 @@ endif()
if(TARGET opencv_test_imgcodecs AND HAVE_OPENEXR AND "$ENV{OPENCV_IO_ENABLE_OPENEXR}")
ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS=1)
endif()
if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION VERSION_LESS "1.6.31")) OR HAVE_SPNG))
if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION_STRING VERSION_LESS "1.6.31")) OR HAVE_SPNG))
# details: https://github.com/glennrp/libpng/commit/68cb0aaee3de6371b81a4613476d9b33e43e95b1
ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_PNG_WITH_EXIF=1)
endif()

View File

@ -754,7 +754,10 @@ bool ExrEncoder::write( const Mat& img, const std::vector<int>& params )
case IMWRITE_EXR_COMPRESSION_B44A:
header.compression() = B44A_COMPRESSION;
break;
#if ((OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR) >= (2 * 1000 + 2)) // available since version 2.2.0
// version macros introduced in openexr 2.0.1.
// - https://github.com/AcademySoftwareFoundation/openexr/commit/60cdff8a6f5c4e25a374e5f366d6e9b4efd869b3#diff-c4bae0726aebe410e407db9abd406d9cf2684f82dd8a08f46d84e8b7c35cf22aR67
#if defined(OPENEXR_VERSION_MAJOR) && defined(OPENEXR_VERSION_MINOR) && OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR >= 2 * 1000 + 2
// available since version 2.2.0
case IMWRITE_EXR_COMPRESSION_DWAA:
header.compression() = DWAA_COMPRESSION;
break;
@ -768,10 +771,12 @@ bool ExrEncoder::write( const Mat& img, const std::vector<int>& params )
}
if (params[i] == IMWRITE_EXR_DWA_COMPRESSION_LEVEL)
{
#if OPENEXR_VERSION_MAJOR >= 3
header.dwaCompressionLevel() = params[i + 1];
#else
#if !defined(OPENEXR_VERSION_MAJOR)
CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in unknown OpenEXR version possibly prior to 2.0.1 (version 3 is required)");
#elif OPENEXR_VERSION_MAJOR < 3
CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in OpenEXR version " + std::to_string(OPENEXR_VERSION_MAJOR) + " (version 3 is required)");
#else
header.dwaCompressionLevel() = params[i + 1];
#endif
}
}

View File

@ -83,6 +83,9 @@ static Size validateInputImageSize(const Size& size)
static inline int calcType(int type, int flags)
{
if ( (flags & (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH)) == (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH))
return type;
if( (flags & IMREAD_LOAD_GDAL) != IMREAD_LOAD_GDAL && flags != IMREAD_UNCHANGED )
{
if( (flags & IMREAD_ANYDEPTH) == 0 )

View File

@ -187,51 +187,6 @@ INSTANTIATE_TEST_CASE_P(
////////////////////////////////////////////////////////////////////////////////
// Parameterized over the AVIF file path (relative to the test data root).
typedef testing::TestWithParam<string> Imgcodecs_AVIF_Exif;
// Checks that the EXIF orientation tag stored in the AVIF file is applied on
// load: after imread, the top-left quadrant must be red, the top-right green
// and the bottom-left blue (channel order is BGR).
TEST_P(Imgcodecs_AVIF_Exif, exif_orientation) {
const string root = cvtest::TS::ptr()->get_data_path();
const string filename = root + GetParam();
const int colorThresholdHigh = 250;
const int colorThresholdLow = 5;
Mat m_img = imread(filename);
ASSERT_FALSE(m_img.empty());
Vec3b vec;
// Checking the first quadrant (with supposed red)
vec = m_img.at<Vec3b>(2, 2); // some point inside the square
EXPECT_LE(vec.val[0], colorThresholdLow);
EXPECT_LE(vec.val[1], colorThresholdLow);
EXPECT_GE(vec.val[2], colorThresholdHigh);
// Checking the second quadrant (with supposed green)
vec = m_img.at<Vec3b>(2, 7); // some point inside the square
EXPECT_LE(vec.val[0], colorThresholdLow);
EXPECT_GE(vec.val[1], colorThresholdHigh);
EXPECT_LE(vec.val[2], colorThresholdLow);
// Checking the third quadrant (with supposed blue)
vec = m_img.at<Vec3b>(7, 2); // some point inside the square
EXPECT_GE(vec.val[0], colorThresholdHigh);
EXPECT_LE(vec.val[1], colorThresholdLow);
EXPECT_LE(vec.val[2], colorThresholdLow);
}
// The same test image stored with each of the 8 possible EXIF orientation
// tag values; the decoder must normalize all of them to the same layout.
const string exif_files[] = {"readwrite/testExifOrientation_1.avif",
"readwrite/testExifOrientation_2.avif",
"readwrite/testExifOrientation_3.avif",
"readwrite/testExifOrientation_4.avif",
"readwrite/testExifOrientation_5.avif",
"readwrite/testExifOrientation_6.avif",
"readwrite/testExifOrientation_7.avif",
"readwrite/testExifOrientation_8.avif"};
// Instantiate the parameterized test once per orientation file.
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_AVIF_Exif,
testing::ValuesIn(exif_files));
////////////////////////////////////////////////////////////////////////////////
class Imgcodecs_Avif_Animation_RoundTripSuite
: public Imgcodecs_Avif_RoundTripSuite {
public:

View File

@ -0,0 +1,151 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html
#include "test_precomp.hpp"
namespace opencv_test { namespace {
/**
* Test to check whether the EXIF orientation tag was processed successfully or not.
* The test uses a set of 8 images named testExifOrientation_{1 to 8}.(extension).
* Each test image is a 10x10 square, divided into four smaller sub-squares:
* (R corresponds to Red, G to Green, B to Blue, W to White)
* --------- ---------
* | R | G | | G | R |
* |-------| - (tag 1) |-------| - (tag 2)
* | B | W | | W | B |
* --------- ---------
*
* --------- ---------
* | W | B | | B | W |
* |-------| - (tag 3) |-------| - (tag 4)
* | G | R | | R | G |
* --------- ---------
*
* --------- ---------
* | R | B | | G | W |
* |-------| - (tag 5) |-------| - (tag 6)
* | G | W | | R | B |
* --------- ---------
*
* --------- ---------
* | W | G | | B | R |
* |-------| - (tag 7) |-------| - (tag 8)
* | B | R | | W | G |
* --------- ---------
*
*
* Each image contains an EXIF field with an orientation tag (0x112).
* After reading each image and applying the orientation tag,
* the resulting image should be:
* ---------
* | R | G |
* |-------|
* | B | W |
* ---------
*
* Note:
* The flags parameter of the imread function is set as IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH.
* Using this combination is an undocumented trick to load images similarly to the IMREAD_UNCHANGED flag,
* preserving the alpha channel (if present) while also applying the orientation.
*/
// Parameterized over the image path, relative to the test data root.
typedef testing::TestWithParam<string> Exif;

TEST_P(Exif, exif_orientation)
{
    // Absolute path of the image under test.
    const string filename = cvtest::TS::ptr()->get_data_path() + GetParam();
    const int colorThresholdHigh = 250; // minimum value accepted as a "full" channel
    const int colorThresholdLow = 5;    // maximum value accepted as an "empty" channel

    // This flag combination applies the EXIF orientation while also keeping
    // the alpha channel when the codec provides one (see the note above).
    Mat m_img = imread(filename, IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH);
    ASSERT_FALSE(m_img.empty());

    if (m_img.channels() == 3)
    {
        // First quadrant: expected red (channel order is BGR).
        Vec3b px = m_img.at<Vec3b>(2, 2); // a point inside the sub-square
        EXPECT_LE(px.val[0], colorThresholdLow);
        EXPECT_LE(px.val[1], colorThresholdLow);
        EXPECT_GE(px.val[2], colorThresholdHigh);
        // Second quadrant: expected green.
        px = m_img.at<Vec3b>(2, 7);
        EXPECT_LE(px.val[0], colorThresholdLow);
        EXPECT_GE(px.val[1], colorThresholdHigh);
        EXPECT_LE(px.val[2], colorThresholdLow);
        // Third quadrant: expected blue.
        px = m_img.at<Vec3b>(7, 2);
        EXPECT_GE(px.val[0], colorThresholdHigh);
        EXPECT_LE(px.val[1], colorThresholdLow);
        EXPECT_LE(px.val[2], colorThresholdLow);
    }
    else
    {
        // Same three checks when the image was decoded with an alpha channel.
        Vec4b px = m_img.at<Vec4b>(2, 2); // first quadrant: expected red
        EXPECT_LE(px.val[0], colorThresholdLow);
        EXPECT_LE(px.val[1], colorThresholdLow);
        EXPECT_GE(px.val[2], colorThresholdHigh);
        px = m_img.at<Vec4b>(2, 7); // second quadrant: expected green
        EXPECT_LE(px.val[0], colorThresholdLow);
        EXPECT_GE(px.val[1], colorThresholdHigh);
        EXPECT_LE(px.val[2], colorThresholdLow);
        px = m_img.at<Vec4b>(7, 2); // third quadrant: expected blue
        EXPECT_GE(px.val[0], colorThresholdHigh);
        EXPECT_LE(px.val[1], colorThresholdLow);
        EXPECT_LE(px.val[2], colorThresholdLow);
    }
}
// Orientation test images for every codec that supports EXIF; each group is
// compiled in only when the corresponding reader is available in this build.
const string exif_files[] =
{
#ifdef HAVE_JPEG
"readwrite/testExifOrientation_1.jpg",
"readwrite/testExifOrientation_2.jpg",
"readwrite/testExifOrientation_3.jpg",
"readwrite/testExifOrientation_4.jpg",
"readwrite/testExifOrientation_5.jpg",
"readwrite/testExifOrientation_6.jpg",
"readwrite/testExifOrientation_7.jpg",
"readwrite/testExifOrientation_8.jpg",
#endif
#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
"readwrite/testExifOrientation_1.png",
"readwrite/testExifOrientation_2.png",
"readwrite/testExifOrientation_3.png",
"readwrite/testExifOrientation_4.png",
"readwrite/testExifOrientation_5.png",
"readwrite/testExifOrientation_6.png",
"readwrite/testExifOrientation_7.png",
"readwrite/testExifOrientation_8.png",
#endif
#ifdef HAVE_AVIF
"readwrite/testExifOrientation_1.avif",
"readwrite/testExifOrientation_2.avif",
"readwrite/testExifOrientation_3.avif",
"readwrite/testExifOrientation_4.avif",
"readwrite/testExifOrientation_5.avif",
"readwrite/testExifOrientation_6.avif",
"readwrite/testExifOrientation_7.avif",
"readwrite/testExifOrientation_8.avif",
#endif
};
// Instantiate the parameterized test once per orientation file.
INSTANTIATE_TEST_CASE_P(Imgcodecs, Exif,
testing::ValuesIn(exif_files));
}
}

View File

@ -11,95 +11,6 @@ extern "C" {
#include "jpeglib.h"
}
/**
* Test to check whether the EXIF orientation tag was processed successfully or not.
* The test data is a set of 8 images named testExifOrientation_{1 to 8}.jpg.
* Each test image is a 10x10 square divided into four sub-squares:
* (R corresponds to Red, G to Green, B to Blue, W to white)
* --------- ---------
* | R | G | | G | R |
* |-------| - (tag 1) |-------| - (tag 2)
* | B | W | | W | B |
* --------- ---------
*
* --------- ---------
* | W | B | | B | W |
* |-------| - (tag 3) |-------| - (tag 4)
* | G | R | | R | G |
* --------- ---------
*
* --------- ---------
* | R | B | | G | W |
* |-------| - (tag 5) |-------| - (tag 6)
* | G | W | | R | B |
* --------- ---------
*
* --------- ---------
* | W | G | | B | R |
* |-------| - (tag 7) |-------| - (tag 8)
* | B | R | | W | G |
* --------- ---------
*
*
* Every image contains exif field with orientation tag (0x112)
* After reading each image the corresponding matrix must be read as
* ---------
* | R | G |
* |-------|
* | B | W |
* ---------
*
*/
// Parameterized over the JPEG file path, relative to the test data root.
typedef testing::TestWithParam<string> Imgcodecs_Jpeg_Exif;
TEST_P(Imgcodecs_Jpeg_Exif, exif_orientation)
{
    // Absolute path of the image under test.
    const string filename = cvtest::TS::ptr()->get_data_path() + GetParam();
    const int colorThresholdHigh = 250; // minimum value accepted as a "full" channel
    const int colorThresholdLow = 5;    // maximum value accepted as an "empty" channel

    Mat m_img = imread(filename);
    ASSERT_FALSE(m_img.empty());

    // First quadrant: expected red (channel order is BGR).
    Vec3b px = m_img.at<Vec3b>(2, 2); // a point inside the sub-square
    EXPECT_LE(px.val[0], colorThresholdLow);
    EXPECT_LE(px.val[1], colorThresholdLow);
    EXPECT_GE(px.val[2], colorThresholdHigh);
    // Second quadrant: expected green.
    px = m_img.at<Vec3b>(2, 7);
    EXPECT_LE(px.val[0], colorThresholdLow);
    EXPECT_GE(px.val[1], colorThresholdHigh);
    EXPECT_LE(px.val[2], colorThresholdLow);
    // Third quadrant: expected blue.
    px = m_img.at<Vec3b>(7, 2);
    EXPECT_GE(px.val[0], colorThresholdHigh);
    EXPECT_LE(px.val[1], colorThresholdLow);
    EXPECT_LE(px.val[2], colorThresholdLow);
}
// The same test image stored with each of the 8 possible EXIF orientation
// tag values; the decoder must normalize all of them to the same layout.
const string exif_files[] =
{
"readwrite/testExifOrientation_1.jpg",
"readwrite/testExifOrientation_2.jpg",
"readwrite/testExifOrientation_3.jpg",
"readwrite/testExifOrientation_4.jpg",
"readwrite/testExifOrientation_5.jpg",
"readwrite/testExifOrientation_6.jpg",
"readwrite/testExifOrientation_7.jpg",
"readwrite/testExifOrientation_8.jpg"
};
// Instantiate the parameterized test once per orientation file.
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_Jpeg_Exif,
testing::ValuesIn(exif_files));
//==================================================================================================
TEST(Imgcodecs_Jpeg, encode_empty)
{
cv::Mat img;

View File

@ -109,100 +109,6 @@ TEST(Imgcodecs_Png, read_color_palette_with_alpha)
EXPECT_EQ(img.at<Vec3b>(0, 1), Vec3b(255, 0, 0));
}
/**
* Test to check whether the EXIF orientation tag was processed successfully or not.
* The test data is a set of 8 images named testExifOrientation_{1 to 8}.png.
* Each test image is a 10x10 square divided into four sub-squares:
* (R corresponds to Red, G to Green, B to Blue, W to white)
* --------- ---------
* | R | G | | G | R |
* |-------| - (tag 1) |-------| - (tag 2)
* | B | W | | W | B |
* --------- ---------
*
* --------- ---------
* | W | B | | B | W |
* |-------| - (tag 3) |-------| - (tag 4)
* | G | R | | R | G |
* --------- ---------
*
* --------- ---------
* | R | B | | G | W |
* |-------| - (tag 5) |-------| - (tag 6)
* | G | W | | R | B |
* --------- ---------
*
* --------- ---------
* | W | G | | B | R |
* |-------| - (tag 7) |-------| - (tag 8)
* | B | R | | W | G |
* --------- ---------
*
*
* Every image contains exif field with orientation tag (0x112)
* After reading each image and applying the orientation tag,
* the resulting image should be:
* ---------
* | R | G |
* |-------|
* | B | W |
* ---------
*
*/
// Parameterized over the PNG file path (relative to the test data root).
typedef testing::TestWithParam<string> Imgcodecs_PNG_Exif;
// Solution to issue 16579: PNG read doesn't support Exif orientation data
// Run only when imgcodecs was built with PNG EXIF support; otherwise keep
// the test registered but disabled so the gap stays visible in reports.
#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
TEST_P(Imgcodecs_PNG_Exif, exif_orientation)
#else
TEST_P(Imgcodecs_PNG_Exif, DISABLED_exif_orientation)
#endif
{
const string root = cvtest::TS::ptr()->get_data_path();
const string filename = root + GetParam();
const int colorThresholdHigh = 250;
const int colorThresholdLow = 5;
Mat m_img = imread(filename);
ASSERT_FALSE(m_img.empty());
Vec3b vec;
//Checking the first quadrant (with supposed red)
vec = m_img.at<Vec3b>(2, 2); //some point inside the square
EXPECT_LE(vec.val[0], colorThresholdLow);
EXPECT_LE(vec.val[1], colorThresholdLow);
EXPECT_GE(vec.val[2], colorThresholdHigh);
//Checking the second quadrant (with supposed green)
vec = m_img.at<Vec3b>(2, 7); //some point inside the square
EXPECT_LE(vec.val[0], colorThresholdLow);
EXPECT_GE(vec.val[1], colorThresholdHigh);
EXPECT_LE(vec.val[2], colorThresholdLow);
//Checking the third quadrant (with supposed blue)
vec = m_img.at<Vec3b>(7, 2); //some point inside the square
EXPECT_GE(vec.val[0], colorThresholdHigh);
EXPECT_LE(vec.val[1], colorThresholdLow);
EXPECT_LE(vec.val[2], colorThresholdLow);
}
// The same test image stored with each of the 8 possible EXIF orientation
// tag values; the decoder must normalize all of them to the same layout.
const string exif_files[] =
{
"readwrite/testExifOrientation_1.png",
"readwrite/testExifOrientation_2.png",
"readwrite/testExifOrientation_3.png",
"readwrite/testExifOrientation_4.png",
"readwrite/testExifOrientation_5.png",
"readwrite/testExifOrientation_6.png",
"readwrite/testExifOrientation_7.png",
"readwrite/testExifOrientation_8.png"
};
// Instantiate the parameterized test once per orientation file.
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_PNG_Exif,
testing::ValuesIn(exif_files));
typedef testing::TestWithParam<string> Imgcodecs_Png_PngSuite;
TEST_P(Imgcodecs_Png_PngSuite, decode)