mirror of
https://github.com/opencv/opencv.git
synced 2025-01-19 15:04:01 +08:00
Merge branch 4.x
This commit is contained in:
commit
7808d50412
23
3rdparty/kleidicv/CMakeLists.txt
vendored
23
3rdparty/kleidicv/CMakeLists.txt
vendored
@ -1,24 +1,7 @@
|
||||
project(kleidicv_hal)
|
||||
|
||||
set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
|
||||
|
||||
if(KLEIDICV_SOURCE_PATH)
|
||||
set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
|
||||
else()
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
|
||||
if(HAVE_KLEIDICV)
|
||||
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
|
||||
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
|
||||
endif()
|
||||
|
||||
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
|
||||
include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
|
||||
|
21
3rdparty/kleidicv/kleidicv.cmake
vendored
Normal file
21
3rdparty/kleidicv/kleidicv.cmake
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
function(download_kleidicv root_var)
|
||||
set(${root_var} "" PARENT_SCOPE)
|
||||
|
||||
ocv_update(KLEIDICV_SRC_COMMIT "0.2.0")
|
||||
ocv_update(KLEIDICV_SRC_HASH "dabe522e8f55ac342d07a787391dab80")
|
||||
|
||||
set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv")
|
||||
ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
|
||||
HASH ${KLEIDICV_SRC_HASH}
|
||||
URL
|
||||
"${OPENCV_KLEIDICV_URL}"
|
||||
"$ENV{OPENCV_KLEIDICV_URL}"
|
||||
"https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
|
||||
DESTINATION_DIR ${THE_ROOT}
|
||||
ID KLEIDICV
|
||||
STATUS res
|
||||
UNPACK RELATIVE_URL)
|
||||
if(res)
|
||||
set(${root_var} "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
@ -861,7 +861,7 @@ if(NOT DEFINED OpenCV_HAL)
|
||||
set(OpenCV_HAL "OpenCV_HAL")
|
||||
endif()
|
||||
|
||||
if(WITH_KLEIDICV)
|
||||
if(HAVE_KLEIDICV)
|
||||
ocv_debug_message(STATUS "Enable KleidiCV acceleration")
|
||||
if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;")
|
||||
set(OpenCV_HAL "kleidicv;${OpenCV_HAL}")
|
||||
|
@ -161,3 +161,19 @@ if(WITH_CLP)
|
||||
endif()
|
||||
endif()
|
||||
endif(WITH_CLP)
|
||||
|
||||
# --- ARM KleidiCV
|
||||
if(WITH_KLEIDICV)
|
||||
if(KLEIDICV_SOURCE_PATH AND EXISTS "${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
|
||||
set(HAVE_KLEIDICV ON)
|
||||
endif()
|
||||
if(NOT HAVE_KLEIDICV)
|
||||
include("${OpenCV_SOURCE_DIR}/3rdparty/kleidicv/kleidicv.cmake")
|
||||
download_kleidicv(KLEIDICV_SOURCE_PATH)
|
||||
if(KLEIDICV_SOURCE_PATH)
|
||||
set(HAVE_KLEIDICV ON)
|
||||
endif()
|
||||
else()
|
||||
set(HAVE_KLEIDICV OFF)
|
||||
endif()
|
||||
endif(WITH_KLEIDICV)
|
||||
|
@ -613,7 +613,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
|
||||
typedef int (*ScalarFunc)(const uchar* src, size_t step_src,
|
||||
uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool scalarIsFirst);
|
||||
void* scalar, bool scalarIsFirst, int nChannels);
|
||||
|
||||
typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
|
||||
const uchar* src2, size_t step2,
|
||||
@ -887,7 +887,6 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
for( size_t j = 0; j < total; j += blocksize )
|
||||
{
|
||||
int bsz = (int)MIN(total - j, blocksize);
|
||||
Size bszn(bsz*cn, 1);
|
||||
const uchar *sptr1 = ptrs[0];
|
||||
const uchar* sptr2 = buf2;
|
||||
uchar* dptr = ptrs[1];
|
||||
@ -900,17 +899,17 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
// try to perform operation in 1 call, fallback to classic way if fail
|
||||
uchar* opconverted = haveMask ? maskbuf : dptr;
|
||||
if (!scalarFunc || src2.total() != 1 ||
|
||||
scalarFunc(extSptr1, 1, opconverted, 1, bszn.width, bszn.height, (void*)extSptr2, swapped12) != 0)
|
||||
scalarFunc(extSptr1, 1, opconverted, 1, bsz, 1, (void*)extSptr2, swapped12, cn) != 0)
|
||||
{
|
||||
// try to perform operation with conversion in one call
|
||||
// if fail, use converter functions
|
||||
|
||||
if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
|
||||
bszn.width, bszn.height, usrdata) != 0)
|
||||
bsz*cn, 1, usrdata) != 0)
|
||||
{
|
||||
if( cvtsrc1 )
|
||||
{
|
||||
cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
|
||||
cvtsrc1( sptr1, 1, 0, 1, buf1, 1, Size(bsz*cn, 1), 0 );
|
||||
sptr1 = buf1;
|
||||
}
|
||||
|
||||
@ -918,12 +917,12 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
std::swap(sptr1, sptr2);
|
||||
|
||||
uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
|
||||
func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
|
||||
func( sptr1, 1, sptr2, 1, fdst, 1, bsz*cn, 1, usrdata );
|
||||
|
||||
if (cvtdst)
|
||||
{
|
||||
uchar* cdst = haveMask ? maskbuf : dptr;
|
||||
cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
|
||||
cvtdst(wbuf, 1, 0, 1, cdst, 1, Size(bsz*cn, 1), 0);
|
||||
}
|
||||
opconverted = cvtdst ? maskbuf : wbuf;
|
||||
}
|
||||
@ -965,9 +964,9 @@ static BinaryFuncC* getAddTab()
|
||||
}
|
||||
|
||||
static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool /*scalarIsFirst*/)
|
||||
void* scalar, bool /*scalarIsFirst*/, int nChannels)
|
||||
{
|
||||
int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar);
|
||||
int res = cv_hal_addScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
|
||||
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
|
||||
return res;
|
||||
else
|
||||
@ -978,9 +977,9 @@ static int addScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst,
|
||||
}
|
||||
|
||||
static int addScalar16s16sWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool /*scalarIsFirst*/)
|
||||
void* scalar, bool /*scalarIsFirst*/, int nChannels)
|
||||
{
|
||||
int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar);
|
||||
int res = cv_hal_addScalar16s16s((const int16_t*)src, step_src, (int16_t *)dst, step_dst, width, height, (const int16_t*)scalar, nChannels);
|
||||
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
|
||||
return res;
|
||||
else
|
||||
@ -1094,6 +1093,67 @@ static BinaryFuncC* getAbsDiffTab()
|
||||
return absDiffTab;
|
||||
}
|
||||
|
||||
|
||||
static int absDiffScalar32f32fWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool /*scalarIsFirst*/, int nChannels)
|
||||
{
|
||||
int res = cv_hal_absDiffScalar32f32f((const float*)src, step_src, (float *)dst, step_dst, width, height, (const float*)scalar, nChannels);
|
||||
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
|
||||
return res;
|
||||
else
|
||||
{
|
||||
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
|
||||
" returned %d (0x%08x)", res, res));
|
||||
}
|
||||
}
|
||||
|
||||
static int absDiffScalar32s32uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool /*scalarIsFirst*/, int nChannels)
|
||||
{
|
||||
int res = cv_hal_absDiffScalar32s32u((const int*)src, step_src, (uint32_t*)dst, step_dst, width, height, (const int*)scalar, nChannels);
|
||||
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
|
||||
return res;
|
||||
else
|
||||
{
|
||||
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
|
||||
" returned %d (0x%08x)", res, res));
|
||||
}
|
||||
}
|
||||
|
||||
static int absDiffScalar8u8uWrapper(const uchar* src, size_t step_src, uchar* dst, size_t step_dst, int width, int height,
|
||||
void* scalar, bool /*scalarIsFirst*/, int nChannels)
|
||||
{
|
||||
int res = cv_hal_absDiffScalar8u8u((const uchar*)src, step_src, (uchar*)dst, step_dst, width, height, (const uchar*)scalar, nChannels);
|
||||
if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
|
||||
return res;
|
||||
else
|
||||
{
|
||||
CV_Error_(cv::Error::StsInternal, ("HAL implementation addScalar32f32f ==> " CVAUX_STR(cv_hal_addScalar32f32f)
|
||||
" returned %d (0x%08x)", res, res));
|
||||
}
|
||||
}
|
||||
|
||||
static ScalarFunc getAbsDiffScalarFunc(int srcType, int dstType)
|
||||
{
|
||||
if (srcType == CV_32F && dstType == CV_32F)
|
||||
{
|
||||
return absDiffScalar32f32fWrapper;
|
||||
}
|
||||
// resulting type is 32U in fact
|
||||
else if (srcType == CV_32S && dstType == CV_32S)
|
||||
{
|
||||
return absDiffScalar32s32uWrapper;
|
||||
}
|
||||
else if (srcType == CV_8U && dstType == CV_8U)
|
||||
{
|
||||
return absDiffScalar8u8uWrapper;
|
||||
}
|
||||
else
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void cv::add( InputArray src1, InputArray src2, OutputArray dst,
|
||||
@ -1108,7 +1168,17 @@ void cv::add( InputArray src1, InputArray src2, OutputArray dst,
|
||||
return;
|
||||
}
|
||||
|
||||
ScalarFunc scalarFunc = getAddScalarFunc(src1.depth(), dtype < 0 ? dst.depth() : dtype);
|
||||
int sdepth = src1.depth();
|
||||
if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
|
||||
{
|
||||
sdepth = src2.depth();
|
||||
}
|
||||
if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
|
||||
{
|
||||
sdepth = src1.depth();
|
||||
}
|
||||
|
||||
ScalarFunc scalarFunc = getAddScalarFunc(sdepth, dtype < 0 ? dst.depth() : dtype);
|
||||
arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD, nullptr,
|
||||
/* scalarFunc */ scalarFunc );
|
||||
}
|
||||
@ -1141,7 +1211,18 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
|
||||
return;
|
||||
}
|
||||
|
||||
arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
|
||||
int sdepth = src1.depth();
|
||||
if (checkScalar(src1, src1.type(), src1.kind(), _InputArray::MATX))
|
||||
{
|
||||
sdepth = src2.depth();
|
||||
}
|
||||
if (checkScalar(src2, src2.type(), src2.kind(), _InputArray::MATX))
|
||||
{
|
||||
sdepth = src1.depth();
|
||||
}
|
||||
ScalarFunc scalarFunc = getAbsDiffScalarFunc(sdepth, dst.depth());
|
||||
arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF,
|
||||
/* extendedFunc */ nullptr, scalarFunc);
|
||||
}
|
||||
|
||||
void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
|
||||
|
@ -119,9 +119,10 @@ Add scalar: _dst[i] = src[i] + scalar
|
||||
@param width width of the images
|
||||
@param height height of the images
|
||||
@param scalar_data pointer to scalar value
|
||||
@param nChannels number of channels per element
|
||||
*/
|
||||
inline int hal_ni_addScalar32f32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, const float* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addScalar16s16s(const int16_t *src_data, size_t src_step, int16_t *dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addScalar32f32f(const float* src_data, size_t src_step, float* dst_data, size_t dst_step, int width, int height, const float* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addScalar16s16s(const int16_t* src_data, size_t src_step, int16_t* dst_data, size_t dst_step, int width, int height, const int16_t* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
//! @}
|
||||
|
||||
/**
|
||||
@ -190,6 +191,23 @@ inline int hal_ni_absdiff64u(const uint64 *src1_data, size_t src1_step, const ui
|
||||
inline int hal_ni_absdiff64s(const int64 *src1_data, size_t src1_step, const int64 *src2_data, size_t src2_step, int64 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff16f(const cv_hal_f16 *src1_data, size_t src1_step, const cv_hal_f16 *src2_data, size_t src2_step, cv_hal_f16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff16bf(const cv_hal_bf16 *src1_data, size_t src1_step, const cv_hal_bf16 *src2_data, size_t src2_step, cv_hal_bf16 *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
/*
|
||||
Absolute difference with scalar: _dst[i] = | src[i] - scalar |_
|
||||
|
||||
@param src_data source image data
|
||||
@param src_step source image step
|
||||
@param dst_data destination image data
|
||||
@param dst_step destination image step
|
||||
@param width width of the images
|
||||
@param height height of the images
|
||||
@param scalar_data pointer to scalar value
|
||||
@param nChannels number of channels per element
|
||||
*/
|
||||
inline int hal_ni_absDiffScalar32f32f(const float* src_data, size_t src_step, float* dst_data, size_t dst_step, int width, int height, const float* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absDiffScalar32s32u(const int* src_data, size_t src_step, uint32_t* dst_data, size_t dst_step, int width, int height, const int* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absDiffScalar8u8u (const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, const uchar* scalar_data, int nChannels) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
//! @}
|
||||
|
||||
/**
|
||||
@ -279,6 +297,9 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
|
||||
#define cv_hal_absdiff64f hal_ni_absdiff64f
|
||||
#define cv_hal_absdiff16f hal_ni_absdiff16f
|
||||
#define cv_hal_absdiff16bf hal_ni_absdiff16bf
|
||||
#define cv_hal_absDiffScalar32f32f hal_ni_absDiffScalar32f32f
|
||||
#define cv_hal_absDiffScalar32s32u hal_ni_absDiffScalar32s32u
|
||||
#define cv_hal_absDiffScalar8u8u hal_ni_absDiffScalar8u8u
|
||||
#define cv_hal_and8u hal_ni_and8u
|
||||
#define cv_hal_or8u hal_ni_or8u
|
||||
#define cv_hal_xor8u hal_ni_xor8u
|
||||
|
@ -8,7 +8,7 @@ ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV
|
||||
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
|
||||
ocv_add_dispatched_file("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON NEON_FP16)
|
||||
ocv_add_dispatched_file_force_all("layers/cpu_kernels/fast_gemm_kernels" AVX AVX2 NEON LASX)
|
||||
|
||||
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
|
||||
|
@ -12,28 +12,21 @@
|
||||
#include "../../precomp.hpp"
|
||||
#include "convolution.hpp"
|
||||
|
||||
#include "conv_winograd_f63.simd.hpp"
|
||||
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
|
||||
|
||||
namespace cv { namespace dnn {
|
||||
|
||||
#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
|
||||
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
|
||||
|
||||
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
|
||||
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
|
||||
|
||||
/*Input transform*/
|
||||
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
|
||||
float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
|
||||
|
||||
/*Output transform*/
|
||||
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct);
|
||||
|
||||
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
|
||||
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
|
||||
{
|
||||
const cv::dnn::Winofunc func =
|
||||
conv->useFP16 ? cv::dnn::getWinofunc_F16()
|
||||
: (conv->useAVX || conv->useAVX2 || conv->useNEON || conv->useRVV || conv->useSIMD128) ? cv::dnn::getWinofunc_F32()
|
||||
: cv::dnn::Winofunc::empty();
|
||||
|
||||
if (!func.isGood())
|
||||
return 0;
|
||||
|
||||
Mat input = _input.getMat();
|
||||
Mat output = _output.getMat();
|
||||
Mat fusedAddMat = _fusedAddMat.getMat();
|
||||
@ -52,42 +45,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
|
||||
|
||||
const int CONV_WINO_KBLOCK = 4;
|
||||
#if (CV_NEON && CV_NEON_AARCH64)
|
||||
const int CONV_WINO_IBLOCK = 6;
|
||||
#elif CV_TRY_AVX || CV_TRY_AVX2
|
||||
const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
|
||||
#else
|
||||
const int CONV_WINO_IBLOCK = 3;
|
||||
#endif
|
||||
|
||||
#if CV_TRY_AVX || CV_TRY_AVX2
|
||||
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
|
||||
#else
|
||||
const int CONV_WINO_ATOM_F32 = 4;
|
||||
#endif
|
||||
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
|
||||
|
||||
int CONV_WINO_ATOM = CONV_WINO_ATOM_F32;
|
||||
int CONV_WINO_NATOMS = CONV_WINO_NATOMS_F32;
|
||||
|
||||
#ifdef CONV_ARM_FP16
|
||||
// FP 16
|
||||
const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
|
||||
const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
|
||||
#endif
|
||||
|
||||
int esz = sizeof(float );
|
||||
|
||||
#ifdef CONV_ARM_FP16
|
||||
const bool useFP16 = conv->useFP16;
|
||||
if (useFP16)
|
||||
{
|
||||
// works at FP 16.
|
||||
CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
|
||||
CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
|
||||
esz = sizeof(__fp16);
|
||||
}
|
||||
#endif
|
||||
const int CONV_WINO_IBLOCK = func.iblock;
|
||||
const int CONV_WINO_ATOM = func.natom;
|
||||
const int CONV_WINO_NATOMS = CONV_WINO_AREA / CONV_WINO_ATOM;
|
||||
const int esz = func.esz;
|
||||
|
||||
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
|
||||
const size_t inp_planesize = (size_t)Hi*Wi;
|
||||
@ -175,35 +136,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
inptr = inpbuf;
|
||||
inpstep = CONV_WINO_SIZE;
|
||||
}
|
||||
|
||||
#if CV_TRY_AVX2
|
||||
if (conv->useAVX2)
|
||||
opt_AVX2::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_AVX
|
||||
if (conv->useAVX)
|
||||
opt_AVX::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
|
||||
else
|
||||
#endif
|
||||
#if CV_NEON && CV_NEON_AARCH64
|
||||
if (conv->useNEON)
|
||||
{
|
||||
#ifdef CONV_ARM_FP16
|
||||
if (useFP16)
|
||||
{
|
||||
opt_NEON_FP16::winofunc_BtXB_8x8_F16(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_ATOM);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
opt_NEON::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_ATOM);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
|
||||
|
||||
func.BtXB_8x8(inptr, inpstep, (uchar*)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -219,18 +152,20 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
// apply inverse Winograd transforms to the sums,
|
||||
// add bias, apply activation function if any and store the results.
|
||||
char* wptr0 = nullptr;
|
||||
#ifdef CONV_ARM_FP16
|
||||
if (useFP16)
|
||||
if (esz == 2)
|
||||
{
|
||||
CV_Assert(!conv->weightsWinoBuf_FP16.empty());
|
||||
wptr0 = (char *)conv->getWeightsWinoFP16();
|
||||
}
|
||||
else
|
||||
#endif
|
||||
else if (esz == 4)
|
||||
{
|
||||
CV_Assert(!conv->weightsWinoBuf.empty());
|
||||
wptr0 = (char *)conv->getWeightsWino();
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Error(Error::StsError, "Impossible configuration");
|
||||
}
|
||||
|
||||
parallel_for_(Range(0, ntasks), [&](const Range& r0) {
|
||||
for (int task_id = r0.start; task_id < r0.end; task_id++)
|
||||
@ -271,36 +206,9 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
char* inwptr = wbuf_all + inwofs * esz;
|
||||
char* wptr = wptr0 + wofs * esz;
|
||||
|
||||
#if CV_TRY_AVX2
|
||||
if (conv->useAVX2)
|
||||
opt_AVX2::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_AVX
|
||||
if (conv->useAVX)
|
||||
opt_AVX::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
else
|
||||
#endif
|
||||
#if CV_NEON && CV_NEON_AARCH64
|
||||
if (conv->useNEON)
|
||||
{
|
||||
#ifdef CONV_ARM_FP16
|
||||
if (useFP16)
|
||||
{
|
||||
opt_NEON_FP16::winofunc_accum_F16(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
opt_NEON::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
func.accum((uchar*)inwptr, (uchar*)wptr, (uchar*)out_wbuf, Cg,
|
||||
block_id1 - block_id0, CONV_WINO_IBLOCK,
|
||||
CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
|
||||
|
||||
for (int k = k0; k < k1; k++)
|
||||
{
|
||||
@ -336,37 +244,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
dx1*sizeof(pbptr0[0]));
|
||||
}
|
||||
}
|
||||
#if CV_TRY_AVX2
|
||||
if (conv->useAVX2)
|
||||
opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
else
|
||||
#endif
|
||||
#if CV_TRY_AVX
|
||||
if (conv->useAVX)
|
||||
opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
else
|
||||
#endif
|
||||
#if CV_NEON && CV_NEON_AARCH64
|
||||
// NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics.
|
||||
if (conv->useNEON)
|
||||
{
|
||||
#ifdef CONV_ARM_FP16
|
||||
if (useFP16)
|
||||
{
|
||||
opt_NEON_FP16::winofunc_AtXA_8x8_F16(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA * esz, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
opt_NEON::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
|
||||
const int count = ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA;
|
||||
func.AtXA_8x8((uchar*)out_wbuf + count * esz, CONV_WINO_SIZE,
|
||||
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
|
||||
|
||||
if (partial)
|
||||
{
|
||||
@ -383,441 +264,4 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
|
||||
return 1;
|
||||
}
|
||||
|
||||
/****************************************************************************************\
|
||||
SIMD for winograd function
|
||||
\****************************************************************************************/
|
||||
|
||||
#if CV_SIMD128
|
||||
|
||||
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
|
||||
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
|
||||
{
|
||||
#if 1
|
||||
CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4);
|
||||
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
|
||||
outbuf += winoAtomF32)
|
||||
{
|
||||
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
|
||||
v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00;
|
||||
v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00;
|
||||
v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00;
|
||||
|
||||
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
|
||||
wptr += winoKblock*winoAtomF32)
|
||||
{
|
||||
v_float32x4 x0, x1, x2;
|
||||
x0 = v_load(inwptr);
|
||||
x1 = v_load(inwptr + 4);
|
||||
x2 = v_load(inwptr + 8);
|
||||
|
||||
v_float32x4 w0 = v_load(wptr);
|
||||
s00 = v_fma(w0, x0, s00);
|
||||
s01 = v_fma(w0, x1, s01);
|
||||
s02 = v_fma(w0, x2, s02);
|
||||
|
||||
w0 = v_load(wptr + 4);
|
||||
s10 = v_fma(w0, x0, s10);
|
||||
s11 = v_fma(w0, x1, s11);
|
||||
s12 = v_fma(w0, x2, s12);
|
||||
|
||||
w0 = v_load(wptr + 8);
|
||||
s20 = v_fma(w0, x0, s20);
|
||||
s21 = v_fma(w0, x1, s21);
|
||||
s22 = v_fma(w0, x2, s22);
|
||||
|
||||
w0 = v_load(wptr + 12);
|
||||
s30 = v_fma(w0, x0, s30);
|
||||
s31 = v_fma(w0, x1, s31);
|
||||
s32 = v_fma(w0, x2, s32);
|
||||
}
|
||||
|
||||
v_store(outbuf, s00);
|
||||
v_store(outbuf + 1*64, s01);
|
||||
v_store(outbuf + 2*64, s02);
|
||||
v_store(outbuf + 3*64, s10);
|
||||
v_store(outbuf + 4*64, s11);
|
||||
v_store(outbuf + 5*64, s12);
|
||||
v_store(outbuf + 6*64, s20);
|
||||
v_store(outbuf + 7*64, s21);
|
||||
v_store(outbuf + 8*64, s22);
|
||||
v_store(outbuf + 9*64, s30);
|
||||
v_store(outbuf + 10*64, s31);
|
||||
v_store(outbuf + 11*64, s32);
|
||||
}
|
||||
#else
|
||||
// Naive C++ code, the code should never be run here.
|
||||
for (int atom_id = 0; atom_id < winoNatomF32;
|
||||
atom_id++, outbuf += winoAtomF32)
|
||||
{
|
||||
float sumbuf[winoIblock*winoKblock*winoAtomF32];
|
||||
memset(sumbuf, 0, sizeof(sumbuf));
|
||||
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
|
||||
wptr += winoKblock*winoAtomF32)
|
||||
{
|
||||
for (int i = 0; i < winoKblock; i++)
|
||||
{
|
||||
for (int j = 0; j < winoIblock; j++)
|
||||
{
|
||||
int i_ = i*winoAtomF32;
|
||||
int j_ = j*winoAtomF32;
|
||||
int ij_ = i_*winoIblock + j_;
|
||||
float s0 = inwptr[j_ + 0]*wptr[i_ + 0];
|
||||
float s1 = inwptr[j_ + 1]*wptr[i_ + 1];
|
||||
float s2 = inwptr[j_ + 2]*wptr[i_ + 2];
|
||||
float s3 = inwptr[j_ + 3]*wptr[i_ + 3];
|
||||
sumbuf[ij_ + 0] += s0;
|
||||
sumbuf[ij_ + 1] += s1;
|
||||
sumbuf[ij_ + 2] += s2;
|
||||
sumbuf[ij_ + 3] += s3;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int ij = 0; ij < winoKblock*winoIblock; ij++)
|
||||
{
|
||||
int ij_ = ij*winoAtomF32;
|
||||
int ij_out = ij*CONV_WINO_AREA;
|
||||
outbuf[ij_out + 0] = sumbuf[ij_ + 0];
|
||||
outbuf[ij_out + 1] = sumbuf[ij_ + 1];
|
||||
outbuf[ij_out + 2] = sumbuf[ij_ + 2];
|
||||
outbuf[ij_out + 3] = sumbuf[ij_ + 3];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*Input transform*/
|
||||
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
|
||||
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
|
||||
{
|
||||
CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
|
||||
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
|
||||
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
|
||||
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
|
||||
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
|
||||
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
|
||||
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
|
||||
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
|
||||
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
|
||||
|
||||
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
|
||||
|
||||
{
|
||||
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
|
||||
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
|
||||
v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
|
||||
t00 = v_sub(x40, x20);
|
||||
t01 = v_sub(x41, x21);
|
||||
t10 = v_sub(x30, x50);
|
||||
t11 = v_sub(x31, x51);
|
||||
v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
|
||||
v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
|
||||
v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
|
||||
v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
|
||||
|
||||
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
|
||||
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
|
||||
v_float32x4 qm4_25 = v_setall_f32(-4.25f);
|
||||
t00 = v_fma(x30, qm4_25, v_add(x10, x50));
|
||||
t01 = v_fma(x31, qm4_25, v_add(x11, x51));
|
||||
t10 = v_fma(x40, qm4_25, v_add(x20, x60));
|
||||
t11 = v_fma(x41, qm4_25, v_add(x21, x61));
|
||||
|
||||
v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
|
||||
v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
|
||||
|
||||
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
|
||||
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
|
||||
v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
|
||||
v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
|
||||
t00 = v_fma(x10, q0_5, v_add(x50, x50));
|
||||
t01 = v_fma(x11, q0_5, v_add(x51, x51));
|
||||
t10 = v_fma(x20, q0_25, x60);
|
||||
t11 = v_fma(x21, q0_25, x61);
|
||||
t00 = v_fma(x30, qm2_5, t00);
|
||||
t01 = v_fma(x31, qm2_5, t01);
|
||||
t10 = v_fma(x40, qm1_25, t10);
|
||||
t11 = v_fma(x41, qm1_25, t11);
|
||||
|
||||
v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
|
||||
v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
|
||||
|
||||
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
|
||||
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
|
||||
v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
|
||||
t00 = v_fma(x50, q0_5, v_add(x10, x10));
|
||||
t01 = v_fma(x51, q0_5, v_add(x11, x11));
|
||||
t10 = v_fma(x20, q4 , x60);
|
||||
t11 = v_fma(x21, q4 , x61);
|
||||
t00 = v_fma(x30, qm2_5, t00);
|
||||
t01 = v_fma(x31, qm2_5, t01);
|
||||
t10 = v_fma(x40, qm5 , t10);
|
||||
t11 = v_fma(x41, qm5 , t11);
|
||||
|
||||
v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
|
||||
v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
|
||||
|
||||
/* transpose 8x8 matrix with v_transpose4x4 */
|
||||
|
||||
v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
|
||||
v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
|
||||
v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
|
||||
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
|
||||
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
|
||||
|
||||
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
|
||||
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
|
||||
t00 = v_sub(y010, y200);
|
||||
t01 = v_sub(y410, y600);
|
||||
t10 = v_sub(y300, y110);
|
||||
t11 = v_sub(y700, y510);
|
||||
z00 = v_fma(t00, q5_25, v_sub(y000, y210));
|
||||
z01 = v_fma(t01, q5_25, v_sub(y400, y610));
|
||||
z70 = v_fma(t10, q5_25, v_sub(y310, y100));
|
||||
z71 = v_fma(t11, q5_25, v_sub(y710, y500));
|
||||
|
||||
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
|
||||
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y300, qm4_25, v_add(y100, y110));
|
||||
t01 = v_fma(y700, qm4_25, v_add(y500, y510));
|
||||
t10 = v_fma(y010, qm4_25, v_add(y200, y210));
|
||||
t11 = v_fma(y410, qm4_25, v_add(y600, y610));
|
||||
|
||||
z10 = v_add(t00, t10); z11 = v_add(t01, t11);
|
||||
z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
|
||||
|
||||
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
|
||||
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y100, q0_5, v_add(y110, y110));
|
||||
t01 = v_fma(y500, q0_5, v_add(y510, y510));
|
||||
t10 = v_fma(y200, q0_25, y210);
|
||||
t11 = v_fma(y600, q0_25, y610);
|
||||
t00 = v_fma(y300, qm2_5, t00);
|
||||
t01 = v_fma(y700, qm2_5, t01);
|
||||
t10 = v_fma(y010, qm1_25, t10);
|
||||
t11 = v_fma(y410, qm1_25, t11);
|
||||
|
||||
z30 = v_add(t00, t10); z31 = v_add(t01, t11);
|
||||
z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
|
||||
|
||||
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
|
||||
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
|
||||
t00 = v_fma(y110, q0_5, v_add(y100, y100));
|
||||
t01 = v_fma(y510, q0_5, v_add(y500, y500));
|
||||
t10 = v_fma(y200, q4, y210);
|
||||
t11 = v_fma(y600, q4, y610);
|
||||
t00 = v_fma(y300, qm2_5, t00);
|
||||
t01 = v_fma(y700, qm2_5, t01);
|
||||
t10 = v_fma(y010, qm5, t10);
|
||||
t11 = v_fma(y410, qm5, t11);
|
||||
|
||||
z50 = v_add(t00, t10); z51 = v_add(t01, t11);
|
||||
z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
|
||||
}
|
||||
|
||||
const int outstep = winoIblock*winoAtomF32*Cg;
|
||||
|
||||
v_store(outptr, z00);
|
||||
v_store(outptr + outstep, z01);
|
||||
v_store(outptr + outstep*2, z10);
|
||||
v_store(outptr + outstep*3, z11);
|
||||
v_store(outptr + outstep*4, z20);
|
||||
v_store(outptr + outstep*5, z21);
|
||||
v_store(outptr + outstep*6, z30);
|
||||
v_store(outptr + outstep*7, z31);
|
||||
v_store(outptr + outstep*8, z40);
|
||||
v_store(outptr + outstep*9, z41);
|
||||
v_store(outptr + outstep*10, z50);
|
||||
v_store(outptr + outstep*11, z51);
|
||||
v_store(outptr + outstep*12, z60);
|
||||
v_store(outptr + outstep*13, z61);
|
||||
v_store(outptr + outstep*14, z70);
|
||||
v_store(outptr + outstep*15, z71);
|
||||
}
|
||||
|
||||
/*Output transform*/
|
||||
/* Inverse Winograd 8x8 transform:
|
||||
out = (A'*inp*A)', where
|
||||
inp is input 8x8 FP32 matrix,
|
||||
A' is
|
||||
[1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,
|
||||
0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f,
|
||||
0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f,
|
||||
0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f,
|
||||
0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
|
||||
0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
|
||||
|
||||
inp is pre-loaded into xij registers,
|
||||
out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1.
|
||||
|
||||
After the inverse transform is done, we add bias,
|
||||
optionally add results from the earlier tensors (by-pass),
|
||||
optionally apply activation function and then
|
||||
store the final results.
|
||||
|
||||
That is, after both forward and then inverse transformation,
|
||||
we get non-transposed result.
|
||||
Of course, for the correct work of Winograd-based convolution,
|
||||
the Winograd-transformed weights should also be transposed.
|
||||
init_conv() (see OpConv.fx) takes care of that.
|
||||
*/
|
||||
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
|
||||
float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct)
|
||||
{
|
||||
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
|
||||
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
|
||||
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
|
||||
v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4);
|
||||
v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4);
|
||||
v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4);
|
||||
v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4);
|
||||
v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4);
|
||||
v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
|
||||
|
||||
{
|
||||
v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
|
||||
s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
|
||||
s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
|
||||
s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
|
||||
|
||||
v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
|
||||
v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
|
||||
|
||||
v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
|
||||
v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) );
|
||||
|
||||
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
|
||||
v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
|
||||
s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
|
||||
s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
|
||||
|
||||
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
|
||||
v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
|
||||
v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
|
||||
|
||||
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
|
||||
v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f);
|
||||
v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
|
||||
|
||||
/* transpose 8x8 matrix with v_transpose4x4 */
|
||||
|
||||
v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710;
|
||||
v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300);
|
||||
v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310);
|
||||
v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
|
||||
v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
|
||||
|
||||
s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
|
||||
s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
|
||||
s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
|
||||
|
||||
z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
|
||||
z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
|
||||
|
||||
a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
|
||||
z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f);
|
||||
z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
|
||||
s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
|
||||
s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
|
||||
|
||||
a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
|
||||
z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
|
||||
z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
|
||||
a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
|
||||
z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f);
|
||||
z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
|
||||
z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
|
||||
|
||||
v_float32x4 vbias = v_setall_f32(bias);
|
||||
z00 = v_add(z00, vbias);
|
||||
z01 = v_add(z01, vbias);
|
||||
z10 = v_add(z10, vbias);
|
||||
z11 = v_add(z11, vbias);
|
||||
z20 = v_add(z20, vbias);
|
||||
z21 = v_add(z21, vbias);
|
||||
z30 = v_add(z30, vbias);
|
||||
z31 = v_add(z31, vbias);
|
||||
z40 = v_add(z40, vbias);
|
||||
z41 = v_add(z41, vbias);
|
||||
z50 = v_add(z50, vbias);
|
||||
z51 = v_add(z51, vbias);
|
||||
}
|
||||
|
||||
if (bpptr)
|
||||
{
|
||||
z00 = v_add(z00, v_load(bpptr));
|
||||
z01 = v_add(z01, v_load_low(bpptr + 4));
|
||||
z10 = v_add(z10, v_load(bpptr + bpstep));
|
||||
z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
|
||||
z20 = v_add(z20, v_load(bpptr + bpstep * 2));
|
||||
z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
|
||||
z30 = v_add(z30, v_load(bpptr + bpstep * 3));
|
||||
z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
|
||||
z40 = v_add(z40, v_load(bpptr + bpstep * 4));
|
||||
z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
|
||||
z50 = v_add(z50, v_load(bpptr + bpstep * 5));
|
||||
z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
{
|
||||
v_float32x4 vmax = v_setall_f32(maxval);
|
||||
v_float32x4 vmin = v_setall_f32(minval);
|
||||
|
||||
z00 = v_min(v_max(z00, vmin), vmax);
|
||||
z01 = v_min(v_max(z01, vmin), vmax);
|
||||
z10 = v_min(v_max(z10, vmin), vmax);
|
||||
z11 = v_min(v_max(z11, vmin), vmax);
|
||||
z20 = v_min(v_max(z20, vmin), vmax);
|
||||
z21 = v_min(v_max(z21, vmin), vmax);
|
||||
z30 = v_min(v_max(z30, vmin), vmax);
|
||||
z31 = v_min(v_max(z31, vmin), vmax);
|
||||
z40 = v_min(v_max(z40, vmin), vmax);
|
||||
z41 = v_min(v_max(z41, vmin), vmax);
|
||||
z50 = v_min(v_max(z50, vmin), vmax);
|
||||
z51 = v_min(v_max(z51, vmin), vmax);
|
||||
}
|
||||
|
||||
v_store(outptr, z00);
|
||||
v_store_low(outptr + 4, z01);
|
||||
v_store(outptr + outstep, z10);
|
||||
v_store_low(outptr + outstep + 4, z11);
|
||||
v_store(outptr + outstep*2, z20);
|
||||
v_store_low(outptr + outstep*2 + 4, z21);
|
||||
v_store(outptr + outstep*3, z30);
|
||||
v_store_low(outptr + outstep*3 + 4, z31);
|
||||
v_store(outptr + outstep*4, z40);
|
||||
v_store_low(outptr + outstep*4 + 4, z41);
|
||||
v_store(outptr + outstep*5, z50);
|
||||
v_store_low(outptr + outstep*5 + 4, z51);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
|
||||
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
}} // namespace cv::dnn
|
||||
|
@ -0,0 +1,22 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "convolution.hpp"
|
||||
#include "conv_winograd_f63.simd.hpp"
|
||||
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp"
|
||||
|
||||
namespace cv {
|
||||
namespace dnn {
|
||||
|
||||
cv::dnn::Winofunc getWinofunc_F32()
|
||||
{
|
||||
CV_CPU_DISPATCH(getWinofunc_F32, (), CV_CPU_DISPATCH_MODES_ALL);
|
||||
}
|
||||
|
||||
cv::dnn::Winofunc getWinofunc_F16()
|
||||
{
|
||||
CV_CPU_DISPATCH(getWinofunc_F16, (), CV_CPU_DISPATCH_MODES_ALL);
|
||||
}
|
||||
|
||||
}} // namespace cv::dnn::
|
@ -1,476 +0,0 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../../precomp.hpp"
|
||||
#include "convolution.hpp"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
|
||||
namespace cv {
|
||||
namespace dnn {
|
||||
|
||||
// NEON code work around.
|
||||
namespace opt_NEON
|
||||
{
|
||||
|
||||
#if CV_NEON && CV_NEON_AARCH64
|
||||
|
||||
/* Accumulate */
|
||||
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
|
||||
const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
|
||||
{
|
||||
CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
|
||||
if (iblock > 3)
|
||||
{
|
||||
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
|
||||
outbuf += winoAtomF32)
|
||||
{
|
||||
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
|
||||
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
|
||||
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
|
||||
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
|
||||
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
|
||||
wptr += winoKblock*winoAtomF32) {
|
||||
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
|
||||
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
|
||||
float32x4_t x0, x1;
|
||||
x0 = vld1q_f32(inwptr);
|
||||
x1 = vld1q_f32(inwptr + 4);
|
||||
s00 = vfmaq_f32(s00, w0, x0);
|
||||
s01 = vfmaq_f32(s01, w0, x1);
|
||||
s10 = vfmaq_f32(s10, w1, x0);
|
||||
s11 = vfmaq_f32(s11, w1, x1);
|
||||
s20 = vfmaq_f32(s20, w2, x0);
|
||||
s21 = vfmaq_f32(s21, w2, x1);
|
||||
s30 = vfmaq_f32(s30, w3, x0);
|
||||
s31 = vfmaq_f32(s31, w3, x1);
|
||||
x0 = vld1q_f32(inwptr + 8);
|
||||
x1 = vld1q_f32(inwptr + 12);
|
||||
s02 = vfmaq_f32(s02, w0, x0);
|
||||
s03 = vfmaq_f32(s03, w0, x1);
|
||||
s12 = vfmaq_f32(s12, w1, x0);
|
||||
s13 = vfmaq_f32(s13, w1, x1);
|
||||
s22 = vfmaq_f32(s22, w2, x0);
|
||||
s23 = vfmaq_f32(s23, w2, x1);
|
||||
s32 = vfmaq_f32(s32, w3, x0);
|
||||
s33 = vfmaq_f32(s33, w3, x1);
|
||||
x0 = vld1q_f32(inwptr + 16);
|
||||
x1 = vld1q_f32(inwptr + 20);
|
||||
s04 = vfmaq_f32(s04, w0, x0);
|
||||
s05 = vfmaq_f32(s05, w0, x1);
|
||||
s14 = vfmaq_f32(s14, w1, x0);
|
||||
s15 = vfmaq_f32(s15, w1, x1);
|
||||
s24 = vfmaq_f32(s24, w2, x0);
|
||||
s25 = vfmaq_f32(s25, w2, x1);
|
||||
s34 = vfmaq_f32(s34, w3, x0);
|
||||
s35 = vfmaq_f32(s35, w3, x1);
|
||||
}
|
||||
|
||||
vst1q_f32(outbuf, s00);
|
||||
vst1q_f32(outbuf + 1*64, s01);
|
||||
vst1q_f32(outbuf + 2*64, s02);
|
||||
vst1q_f32(outbuf + 3*64, s03);
|
||||
vst1q_f32(outbuf + 4*64, s04);
|
||||
vst1q_f32(outbuf + 5*64, s05);
|
||||
|
||||
vst1q_f32(outbuf + 6*64, s10);
|
||||
vst1q_f32(outbuf + 7*64, s11);
|
||||
vst1q_f32(outbuf + 8*64, s12);
|
||||
vst1q_f32(outbuf + 9*64, s13);
|
||||
vst1q_f32(outbuf + 10*64, s14);
|
||||
vst1q_f32(outbuf + 11*64, s15);
|
||||
|
||||
vst1q_f32(outbuf + 12*64, s20);
|
||||
vst1q_f32(outbuf + 13*64, s21);
|
||||
vst1q_f32(outbuf + 14*64, s22);
|
||||
vst1q_f32(outbuf + 15*64, s23);
|
||||
vst1q_f32(outbuf + 16*64, s24);
|
||||
vst1q_f32(outbuf + 17*64, s25);
|
||||
|
||||
vst1q_f32(outbuf + 18*64, s30);
|
||||
vst1q_f32(outbuf + 19*64, s31);
|
||||
vst1q_f32(outbuf + 20*64, s32);
|
||||
vst1q_f32(outbuf + 21*64, s33);
|
||||
vst1q_f32(outbuf + 22*64, s34);
|
||||
vst1q_f32(outbuf + 23*64, s35);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
|
||||
outbuf += winoAtomF32)
|
||||
{
|
||||
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
|
||||
float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
|
||||
float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
|
||||
float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
|
||||
for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
|
||||
wptr += winoKblock*winoAtomF32) {
|
||||
float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
|
||||
float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
|
||||
float32x4_t x0, x1, x2;
|
||||
x0 = vld1q_f32(inwptr);
|
||||
x1 = vld1q_f32(inwptr + 4);
|
||||
x2 = vld1q_f32(inwptr + 8);
|
||||
s00 = vfmaq_f32(s00, w0, x0);
|
||||
s01 = vfmaq_f32(s01, w0, x1);
|
||||
s02 = vfmaq_f32(s02, w0, x2);
|
||||
s10 = vfmaq_f32(s10, w1, x0);
|
||||
s11 = vfmaq_f32(s11, w1, x1);
|
||||
s12 = vfmaq_f32(s12, w1, x2);
|
||||
s20 = vfmaq_f32(s20, w2, x0);
|
||||
s21 = vfmaq_f32(s21, w2, x1);
|
||||
s22 = vfmaq_f32(s22, w2, x2);
|
||||
s30 = vfmaq_f32(s30, w3, x0);
|
||||
s31 = vfmaq_f32(s31, w3, x1);
|
||||
s32 = vfmaq_f32(s32, w3, x2);
|
||||
}
|
||||
|
||||
vst1q_f32(outbuf, s00);
|
||||
vst1q_f32(outbuf + 1*64, s01);
|
||||
vst1q_f32(outbuf + 2*64, s02);
|
||||
vst1q_f32(outbuf + 6*64, s10);
|
||||
vst1q_f32(outbuf + 7*64, s11);
|
||||
vst1q_f32(outbuf + 8*64, s12);
|
||||
vst1q_f32(outbuf + 12*64, s20);
|
||||
vst1q_f32(outbuf + 13*64, s21);
|
||||
vst1q_f32(outbuf + 14*64, s22);
|
||||
vst1q_f32(outbuf + 18*64, s30);
|
||||
vst1q_f32(outbuf + 19*64, s31);
|
||||
vst1q_f32(outbuf + 20*64, s32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef T4x4
|
||||
#define T4x4(a, b, c, d, tr0, tr1) \
|
||||
tr0 = vtrnq_f32(a, b); \
|
||||
tr1 = vtrnq_f32(c, d); \
|
||||
a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
|
||||
b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
|
||||
c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
|
||||
d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
|
||||
|
||||
/*Input transform*/
|
||||
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
|
||||
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
|
||||
{
|
||||
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
|
||||
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
|
||||
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
|
||||
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
|
||||
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
|
||||
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
|
||||
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
|
||||
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
|
||||
|
||||
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
|
||||
|
||||
{
|
||||
/* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
|
||||
/* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
|
||||
float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
|
||||
t00 = vsubq_f32(x40, x20);
|
||||
t01 = vsubq_f32(x41, x21);
|
||||
t10 = vsubq_f32(x30, x50);
|
||||
t11 = vsubq_f32(x31, x51);
|
||||
float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
|
||||
float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
|
||||
float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
|
||||
float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
|
||||
|
||||
/* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
|
||||
/* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
|
||||
float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
|
||||
t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
|
||||
t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
|
||||
t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
|
||||
t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);
|
||||
|
||||
float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
|
||||
float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
|
||||
|
||||
/* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
|
||||
/* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
|
||||
float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
|
||||
float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
|
||||
t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
|
||||
t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
|
||||
t10 = vfmaq_f32(x60, x20, q0_25);
|
||||
t11 = vfmaq_f32(x61, x21, q0_25);
|
||||
t00 = vfmaq_f32(t00, x30, qm2_5);
|
||||
t01 = vfmaq_f32(t01, x31, qm2_5);
|
||||
t10 = vfmaq_f32(t10, x40, qm1_25);
|
||||
t11 = vfmaq_f32(t11, x41, qm1_25);
|
||||
|
||||
float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
|
||||
float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
|
||||
|
||||
/* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
|
||||
/* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
|
||||
float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
|
||||
t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
|
||||
t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
|
||||
t10 = vfmaq_f32(x60, x20, q4);
|
||||
t11 = vfmaq_f32(x61, x21, q4);
|
||||
t00 = vfmaq_f32(t00, x30, qm2_5);
|
||||
t01 = vfmaq_f32(t01, x31, qm2_5);
|
||||
t10 = vfmaq_f32(t10, x40, qm5);
|
||||
t11 = vfmaq_f32(t11, x41, qm5);
|
||||
|
||||
float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
|
||||
float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
|
||||
|
||||
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
|
||||
/* Y: */
|
||||
/* y00 y01 */
|
||||
/* y10 y11 */
|
||||
/* ... */
|
||||
/* y70 y71 */
|
||||
/* Y': */
|
||||
/* y00 y40 */
|
||||
/* y10 y50 */
|
||||
/* y20 y60 */
|
||||
/* y30 y70 */
|
||||
/* y01 y41 */
|
||||
/* y11 y51 */
|
||||
/* y21 y61 */
|
||||
/* y31 y71 */
|
||||
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
|
||||
float32x4x2_t tr0, tr1;
|
||||
|
||||
T4x4(y00, y10, y20, y30, tr0, tr1);
|
||||
T4x4(y01, y11, y21, y31, tr0, tr1);
|
||||
T4x4(y40, y50, y60, y70, tr0, tr1);
|
||||
T4x4(y41, y51, y61, y71, tr0, tr1);
|
||||
|
||||
/* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
|
||||
/* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
|
||||
t00 = vsubq_f32(y01, y20);
|
||||
t01 = vsubq_f32(y41, y60);
|
||||
t10 = vsubq_f32(y30, y11);
|
||||
t11 = vsubq_f32(y70, y51);
|
||||
z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
|
||||
z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
|
||||
z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
|
||||
z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
|
||||
|
||||
/* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
|
||||
/* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
|
||||
t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
|
||||
t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
|
||||
t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
|
||||
t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);
|
||||
|
||||
z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
|
||||
z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
|
||||
|
||||
/* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
|
||||
/* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
|
||||
t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
|
||||
t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
|
||||
t10 = vfmaq_f32(y21, y20, q0_25);
|
||||
t11 = vfmaq_f32(y61, y60, q0_25);
|
||||
t00 = vfmaq_f32(t00, y30, qm2_5);
|
||||
t01 = vfmaq_f32(t01, y70, qm2_5);
|
||||
t10 = vfmaq_f32(t10, y01, qm1_25);
|
||||
t11 = vfmaq_f32(t11, y41, qm1_25);
|
||||
|
||||
z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
|
||||
z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
|
||||
|
||||
/* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
|
||||
/* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
|
||||
t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
|
||||
t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
|
||||
t10 = vfmaq_f32(y21, y20, q4);
|
||||
t11 = vfmaq_f32(y61, y60, q4);
|
||||
t00 = vfmaq_f32(t00, y30, qm2_5);
|
||||
t01 = vfmaq_f32(t01, y70, qm2_5);
|
||||
t10 = vfmaq_f32(t10, y01, qm5);
|
||||
t11 = vfmaq_f32(t11, y41, qm5);
|
||||
|
||||
z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
|
||||
z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
|
||||
}
|
||||
|
||||
const int outstep = winoIblock*winoAtomF32*Cg;
|
||||
|
||||
vst1q_f32(outptr, z00);
|
||||
vst1q_f32(outptr + outstep, z01);
|
||||
vst1q_f32(outptr + outstep*2, z10);
|
||||
vst1q_f32(outptr + outstep*3, z11);
|
||||
vst1q_f32(outptr + outstep*4, z20);
|
||||
vst1q_f32(outptr + outstep*5, z21);
|
||||
vst1q_f32(outptr + outstep*6, z30);
|
||||
vst1q_f32(outptr + outstep*7, z31);
|
||||
vst1q_f32(outptr + outstep*8, z40);
|
||||
vst1q_f32(outptr + outstep*9, z41);
|
||||
vst1q_f32(outptr + outstep*10, z50);
|
||||
vst1q_f32(outptr + outstep*11, z51);
|
||||
vst1q_f32(outptr + outstep*12, z60);
|
||||
vst1q_f32(outptr + outstep*13, z61);
|
||||
vst1q_f32(outptr + outstep*14, z70);
|
||||
vst1q_f32(outptr + outstep*15, z71);
|
||||
}
|
||||
|
||||
/*Output transform*/
|
||||
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
|
||||
float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct)
|
||||
{
|
||||
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
|
||||
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
|
||||
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
|
||||
float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
|
||||
float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
|
||||
float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
|
||||
float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
|
||||
float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
|
||||
float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
|
||||
|
||||
{
|
||||
float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
|
||||
s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
|
||||
s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
|
||||
s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);
|
||||
|
||||
float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
|
||||
float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
|
||||
float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
|
||||
float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
|
||||
float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
|
||||
float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
|
||||
|
||||
s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
|
||||
s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
|
||||
s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
|
||||
|
||||
float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
|
||||
s34_0, 32.f), s56_0, 1.f/32);
|
||||
float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
|
||||
s34_1, 32.f), s56_1, 1.f/32);
|
||||
float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
|
||||
float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
|
||||
float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
|
||||
float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
|
||||
float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
|
||||
|
||||
/* transpose 8x8 matrix in-place with some renumeration of the elements: */
|
||||
/* Y: */
|
||||
/* y00 y01 */
|
||||
/* y10 y11 */
|
||||
/* ... */
|
||||
/* y50 y51 */
|
||||
/* 0 0 */
|
||||
/* 0 0 */
|
||||
/* Y': */
|
||||
/* y00 y40 */
|
||||
/* y10 y50 */
|
||||
/* y20 y60 */
|
||||
/* y30 y70 */
|
||||
/* y01 y41 */
|
||||
/* y11 y51 */
|
||||
/* y21 y61 */
|
||||
/* y31 y71 */
|
||||
/* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
|
||||
float32x4x2_t tr0, tr1;
|
||||
|
||||
T4x4(y00, y10, y20, y30, tr0, tr1);
|
||||
T4x4(y01, y11, y21, y31, tr0, tr1);
|
||||
T4x4(y40, y50, y60, y70, tr0, tr1);
|
||||
T4x4(y41, y51, y61, y71, tr0, tr1);
|
||||
|
||||
s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
|
||||
s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
|
||||
s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);
|
||||
|
||||
z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
|
||||
z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
|
||||
z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
|
||||
z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
|
||||
z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
|
||||
z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
|
||||
|
||||
s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
|
||||
s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
|
||||
s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
|
||||
|
||||
z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
|
||||
s34_0, 32.f), s56_0, 1.f/32);
|
||||
z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
|
||||
s34_1, 32.f), s56_1, 1.f/32);
|
||||
z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
|
||||
z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
|
||||
z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
|
||||
z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
|
||||
float32x4_t vbias = vdupq_n_f32(bias);
|
||||
|
||||
z00 = vaddq_f32(z00, vbias);
|
||||
z01 = vaddq_f32(z01, vbias);
|
||||
z10 = vaddq_f32(z10, vbias);
|
||||
z11 = vaddq_f32(z11, vbias);
|
||||
z20 = vaddq_f32(z20, vbias);
|
||||
z21 = vaddq_f32(z21, vbias);
|
||||
z30 = vaddq_f32(z30, vbias);
|
||||
z31 = vaddq_f32(z31, vbias);
|
||||
z40 = vaddq_f32(z40, vbias);
|
||||
z41 = vaddq_f32(z41, vbias);
|
||||
z50 = vaddq_f32(z50, vbias);
|
||||
z51 = vaddq_f32(z51, vbias);
|
||||
}
|
||||
|
||||
if (bpptr)
|
||||
{
|
||||
float32x2_t zhalf = vdup_n_f32(0.f);
|
||||
z00 = vaddq_f32(z00, vld1q_f32(bpptr));
|
||||
z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
|
||||
z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
|
||||
z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
|
||||
z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
|
||||
z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
|
||||
z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
|
||||
z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
|
||||
z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
|
||||
z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
|
||||
z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
|
||||
z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
|
||||
}
|
||||
|
||||
if (ifMinMaxAct)
|
||||
{
|
||||
float32x4_t vmax = vdupq_n_f32(maxval);
|
||||
float32x4_t vmin = vdupq_n_f32(minval);
|
||||
|
||||
z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
|
||||
z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
|
||||
z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
|
||||
z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
|
||||
z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
|
||||
z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
|
||||
z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
|
||||
z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
|
||||
z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
|
||||
z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
|
||||
z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
|
||||
z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
|
||||
}
|
||||
|
||||
vst1q_f32(outptr, z00);
|
||||
vst1_f32(outptr + 4, vget_low_f32(z01));
|
||||
vst1q_f32(outptr + outstep, z10);
|
||||
vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
|
||||
vst1q_f32(outptr + outstep*2, z20);
|
||||
vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
|
||||
vst1q_f32(outptr + outstep*3, z30);
|
||||
vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
|
||||
vst1q_f32(outptr + outstep*4, z40);
|
||||
vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
|
||||
vst1q_f32(outptr + outstep*5, z50);
|
||||
vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
}} // namespace
|
File diff suppressed because it is too large
Load Diff
@ -6,6 +6,7 @@
|
||||
#define OPENCV_FAST_CONVOLUTION_HPP
|
||||
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
#include "opencv2/dnn/all_layers.hpp"
|
||||
|
||||
#ifndef CONV_PRAM
|
||||
#define CONV_PRAM
|
||||
@ -119,25 +120,30 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo
|
||||
|
||||
void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
|
||||
const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR);
|
||||
|
||||
#if CV_NEON_AARCH64
|
||||
/* Accumulate */
|
||||
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
|
||||
const int winoIblock, const int winoKblock, const int winoAtom, const int winoNatom);
|
||||
|
||||
/*Input transform*/
|
||||
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
|
||||
float* outptr, int Cg, const int winoIblock, const int winoAtom);
|
||||
|
||||
/*Output transform*/
|
||||
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
|
||||
float* bpptr, int bpstep, float* outptr, int outstep,
|
||||
float bias, float minval, float maxval, bool ifMinMaxAct);
|
||||
#endif // CV_NEON_AARCH64
|
||||
#endif // CV_NEON
|
||||
} // namespace opt_NEON.
|
||||
|
||||
|
||||
|
||||
// === Function tables
|
||||
struct Winofunc
|
||||
{
|
||||
void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
|
||||
void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg, const int winoIblock, const int winoAtomF32);
|
||||
void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct);
|
||||
int iblock;
|
||||
int natom;
|
||||
int esz;
|
||||
|
||||
bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8 && iblock > 0 && natom > 0 && esz > 0; }
|
||||
static Winofunc empty() { return {0, 0, 0, 0, 0, 0}; }
|
||||
};
|
||||
|
||||
// === wrapper calls (implemented in .dispatch.cpp)
|
||||
Winofunc getWinofunc_F32();
|
||||
Winofunc getWinofunc_F16();
|
||||
|
||||
|
||||
} // namespace dnn
|
||||
} // namespace cv
|
||||
|
||||
|
@ -1,235 +0,0 @@
|
||||
/***********************************************************************
|
||||
* Software License Agreement (BSD License)
|
||||
*
|
||||
* Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
|
||||
* Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*************************************************************************/
|
||||
|
||||
|
||||
#ifndef OPENCV_FLANN_HDF5_H_
|
||||
#define OPENCV_FLANN_HDF5_H_
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
#include <hdf5.h>
|
||||
|
||||
#include "matrix.h"
|
||||
|
||||
|
||||
namespace cvflann
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template<typename T>
|
||||
hid_t get_hdf5_type()
|
||||
{
|
||||
throw FLANNException("Unsupported type for IO operations");
|
||||
}
|
||||
|
||||
template<>
|
||||
hid_t get_hdf5_type<char>() { return H5T_NATIVE_CHAR; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<unsigned char>() { return H5T_NATIVE_UCHAR; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<short int>() { return H5T_NATIVE_SHORT; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<unsigned short int>() { return H5T_NATIVE_USHORT; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<int>() { return H5T_NATIVE_INT; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<unsigned int>() { return H5T_NATIVE_UINT; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<long>() { return H5T_NATIVE_LONG; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<unsigned long>() { return H5T_NATIVE_ULONG; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<float>() { return H5T_NATIVE_FLOAT; }
|
||||
template<>
|
||||
hid_t get_hdf5_type<double>() { return H5T_NATIVE_DOUBLE; }
|
||||
}
|
||||
|
||||
|
||||
#define CHECK_ERROR(x,y) if ((x)<0) throw FLANNException((y));
|
||||
|
||||
template<typename T>
|
||||
void save_to_file(const cvflann::Matrix<T>& dataset, const String& filename, const String& name)
|
||||
{
|
||||
|
||||
#if H5Eset_auto_vers == 2
|
||||
H5Eset_auto( H5E_DEFAULT, NULL, NULL );
|
||||
#else
|
||||
H5Eset_auto( NULL, NULL );
|
||||
#endif
|
||||
|
||||
herr_t status;
|
||||
hid_t file_id;
|
||||
file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
|
||||
if (file_id < 0) {
|
||||
file_id = H5Fcreate(filename.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
|
||||
}
|
||||
CHECK_ERROR(file_id,"Error creating hdf5 file.");
|
||||
|
||||
hsize_t dimsf[2]; // dataset dimensions
|
||||
dimsf[0] = dataset.rows;
|
||||
dimsf[1] = dataset.cols;
|
||||
|
||||
hid_t space_id = H5Screate_simple(2, dimsf, NULL);
|
||||
hid_t memspace_id = H5Screate_simple(2, dimsf, NULL);
|
||||
|
||||
hid_t dataset_id;
|
||||
#if H5Dcreate_vers == 2
|
||||
dataset_id = H5Dcreate2(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
|
||||
#else
|
||||
dataset_id = H5Dcreate(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT);
|
||||
#endif
|
||||
|
||||
if (dataset_id<0) {
|
||||
#if H5Dopen_vers == 2
|
||||
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
|
||||
#else
|
||||
dataset_id = H5Dopen(file_id, name.c_str());
|
||||
#endif
|
||||
}
|
||||
CHECK_ERROR(dataset_id,"Error creating or opening dataset in file.");
|
||||
|
||||
status = H5Dwrite(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, H5P_DEFAULT, dataset.data );
|
||||
CHECK_ERROR(status, "Error writing to dataset");
|
||||
|
||||
H5Sclose(memspace_id);
|
||||
H5Sclose(space_id);
|
||||
H5Dclose(dataset_id);
|
||||
H5Fclose(file_id);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
|
||||
{
|
||||
herr_t status;
|
||||
hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
|
||||
CHECK_ERROR(file_id,"Error opening hdf5 file.");
|
||||
|
||||
hid_t dataset_id;
|
||||
#if H5Dopen_vers == 2
|
||||
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
|
||||
#else
|
||||
dataset_id = H5Dopen(file_id, name.c_str());
|
||||
#endif
|
||||
CHECK_ERROR(dataset_id,"Error opening dataset in file.");
|
||||
|
||||
hid_t space_id = H5Dget_space(dataset_id);
|
||||
|
||||
hsize_t dims_out[2];
|
||||
H5Sget_simple_extent_dims(space_id, dims_out, NULL);
|
||||
|
||||
dataset = cvflann::Matrix<T>(new T[dims_out[0]*dims_out[1]], dims_out[0], dims_out[1]);
|
||||
|
||||
status = H5Dread(dataset_id, get_hdf5_type<T>(), H5S_ALL, H5S_ALL, H5P_DEFAULT, dataset[0]);
|
||||
CHECK_ERROR(status, "Error reading dataset");
|
||||
|
||||
H5Sclose(space_id);
|
||||
H5Dclose(dataset_id);
|
||||
H5Fclose(file_id);
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_MPI
|
||||
|
||||
namespace mpi
|
||||
{
|
||||
/**
|
||||
* Loads a the hyperslice corresponding to this processor from a hdf5 file.
|
||||
* @param flann_dataset Dataset where the data is loaded
|
||||
* @param filename HDF5 file name
|
||||
* @param name Name of dataset inside file
|
||||
*/
|
||||
template<typename T>
|
||||
void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
|
||||
{
|
||||
MPI_Comm comm = MPI_COMM_WORLD;
|
||||
MPI_Info info = MPI_INFO_NULL;
|
||||
|
||||
int mpi_size, mpi_rank;
|
||||
MPI_Comm_size(comm, &mpi_size);
|
||||
MPI_Comm_rank(comm, &mpi_rank);
|
||||
|
||||
herr_t status;
|
||||
|
||||
hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
|
||||
H5Pset_fapl_mpio(plist_id, comm, info);
|
||||
hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, plist_id);
|
||||
CHECK_ERROR(file_id,"Error opening hdf5 file.");
|
||||
H5Pclose(plist_id);
|
||||
hid_t dataset_id;
|
||||
#if H5Dopen_vers == 2
|
||||
dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
|
||||
#else
|
||||
dataset_id = H5Dopen(file_id, name.c_str());
|
||||
#endif
|
||||
CHECK_ERROR(dataset_id,"Error opening dataset in file.");
|
||||
|
||||
hid_t space_id = H5Dget_space(dataset_id);
|
||||
hsize_t dims[2];
|
||||
H5Sget_simple_extent_dims(space_id, dims, NULL);
|
||||
|
||||
hsize_t count[2];
|
||||
hsize_t offset[2];
|
||||
|
||||
hsize_t item_cnt = dims[0]/mpi_size+(dims[0]%mpi_size==0 ? 0 : 1);
|
||||
hsize_t cnt = (mpi_rank<mpi_size-1 ? item_cnt : dims[0]-item_cnt*(mpi_size-1));
|
||||
|
||||
count[0] = cnt;
|
||||
count[1] = dims[1];
|
||||
offset[0] = mpi_rank*item_cnt;
|
||||
offset[1] = 0;
|
||||
|
||||
hid_t memspace_id = H5Screate_simple(2,count,NULL);
|
||||
|
||||
H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL);
|
||||
|
||||
dataset.rows = count[0];
|
||||
dataset.cols = count[1];
|
||||
dataset.data = new T[dataset.rows*dataset.cols];
|
||||
|
||||
plist_id = H5Pcreate(H5P_DATASET_XFER);
|
||||
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
|
||||
status = H5Dread(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, plist_id, dataset.data);
|
||||
CHECK_ERROR(status, "Error reading dataset");
|
||||
|
||||
H5Pclose(plist_id);
|
||||
H5Sclose(space_id);
|
||||
H5Sclose(memspace_id);
|
||||
H5Dclose(dataset_id);
|
||||
H5Fclose(file_id);
|
||||
}
|
||||
}
|
||||
#endif // HAVE_MPI
|
||||
} // namespace cvflann::mpi
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif /* OPENCV_FLANN_HDF5_H_ */
|
@ -190,7 +190,7 @@ endif()
|
||||
if(TARGET opencv_test_imgcodecs AND HAVE_OPENEXR AND "$ENV{OPENCV_IO_ENABLE_OPENEXR}")
|
||||
ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS=1)
|
||||
endif()
|
||||
if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION VERSION_LESS "1.6.31")) OR HAVE_SPNG))
|
||||
if(TARGET opencv_test_imgcodecs AND ((HAVE_PNG AND NOT (PNG_VERSION_STRING VERSION_LESS "1.6.31")) OR HAVE_SPNG))
|
||||
# details: https://github.com/glennrp/libpng/commit/68cb0aaee3de6371b81a4613476d9b33e43e95b1
|
||||
ocv_target_compile_definitions(opencv_test_imgcodecs PRIVATE OPENCV_IMGCODECS_PNG_WITH_EXIF=1)
|
||||
endif()
|
||||
|
@ -754,7 +754,10 @@ bool ExrEncoder::write( const Mat& img, const std::vector<int>& params )
|
||||
case IMWRITE_EXR_COMPRESSION_B44A:
|
||||
header.compression() = B44A_COMPRESSION;
|
||||
break;
|
||||
#if ((OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR) >= (2 * 1000 + 2)) // available since version 2.2.0
|
||||
// version macros introduced in openexr 2.0.1.
|
||||
// - https://github.com/AcademySoftwareFoundation/openexr/commit/60cdff8a6f5c4e25a374e5f366d6e9b4efd869b3#diff-c4bae0726aebe410e407db9abd406d9cf2684f82dd8a08f46d84e8b7c35cf22aR67
|
||||
#if defined(OPENEXR_VERSION_MAJOR) && defined(OPENEXR_VERSION_MINOR) && OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR >= 2 * 1000 + 2
|
||||
// available since version 2.2.0
|
||||
case IMWRITE_EXR_COMPRESSION_DWAA:
|
||||
header.compression() = DWAA_COMPRESSION;
|
||||
break;
|
||||
@ -768,10 +771,12 @@ bool ExrEncoder::write( const Mat& img, const std::vector<int>& params )
|
||||
}
|
||||
if (params[i] == IMWRITE_EXR_DWA_COMPRESSION_LEVEL)
|
||||
{
|
||||
#if OPENEXR_VERSION_MAJOR >= 3
|
||||
header.dwaCompressionLevel() = params[i + 1];
|
||||
#else
|
||||
#if !defined(OPENEXR_VERSION_MAJOR)
|
||||
CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in unknown OpenEXR version possibly prior to 2.0.1 (version 3 is required)");
|
||||
#elif OPENEXR_VERSION_MAJOR < 3
|
||||
CV_LOG_ONCE_WARNING(NULL, "Setting `IMWRITE_EXR_DWA_COMPRESSION_LEVEL` not supported in OpenEXR version " + std::to_string(OPENEXR_VERSION_MAJOR) + " (version 3 is required)");
|
||||
#else
|
||||
header.dwaCompressionLevel() = params[i + 1];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -83,6 +83,9 @@ static Size validateInputImageSize(const Size& size)
|
||||
|
||||
static inline int calcType(int type, int flags)
|
||||
{
|
||||
if ( (flags & (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH)) == (IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH))
|
||||
return type;
|
||||
|
||||
if( (flags & IMREAD_LOAD_GDAL) != IMREAD_LOAD_GDAL && flags != IMREAD_UNCHANGED )
|
||||
{
|
||||
if( (flags & IMREAD_ANYDEPTH) == 0 )
|
||||
|
@ -187,51 +187,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef testing::TestWithParam<string> Imgcodecs_AVIF_Exif;
|
||||
|
||||
TEST_P(Imgcodecs_AVIF_Exif, exif_orientation) {
|
||||
const string root = cvtest::TS::ptr()->get_data_path();
|
||||
const string filename = root + GetParam();
|
||||
const int colorThresholdHigh = 250;
|
||||
const int colorThresholdLow = 5;
|
||||
|
||||
Mat m_img = imread(filename);
|
||||
ASSERT_FALSE(m_img.empty());
|
||||
Vec3b vec;
|
||||
|
||||
// Checking the first quadrant (with supposed red)
|
||||
vec = m_img.at<Vec3b>(2, 2); // some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[2], colorThresholdHigh);
|
||||
|
||||
// Checking the second quadrant (with supposed green)
|
||||
vec = m_img.at<Vec3b>(2, 7); // some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[1], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
|
||||
// Checking the third quadrant (with supposed blue)
|
||||
vec = m_img.at<Vec3b>(7, 2); // some point inside the square
|
||||
EXPECT_GE(vec.val[0], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
}
|
||||
|
||||
const string exif_files[] = {"readwrite/testExifOrientation_1.avif",
|
||||
"readwrite/testExifOrientation_2.avif",
|
||||
"readwrite/testExifOrientation_3.avif",
|
||||
"readwrite/testExifOrientation_4.avif",
|
||||
"readwrite/testExifOrientation_5.avif",
|
||||
"readwrite/testExifOrientation_6.avif",
|
||||
"readwrite/testExifOrientation_7.avif",
|
||||
"readwrite/testExifOrientation_8.avif"};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_AVIF_Exif,
|
||||
testing::ValuesIn(exif_files));
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class Imgcodecs_Avif_Animation_RoundTripSuite
|
||||
: public Imgcodecs_Avif_RoundTripSuite {
|
||||
public:
|
||||
|
151
modules/imgcodecs/test/test_exif.cpp
Normal file
151
modules/imgcodecs/test/test_exif.cpp
Normal file
@ -0,0 +1,151 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level
|
||||
// directory of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#include "test_precomp.hpp"
|
||||
|
||||
namespace opencv_test { namespace {
|
||||
|
||||
/**
|
||||
* Test to check whether the EXIF orientation tag was processed successfully or not.
|
||||
* The test uses a set of 8 images named testExifOrientation_{1 to 8}.(extension).
|
||||
* Each test image is a 10x10 square, divided into four smaller sub-squares:
|
||||
* (R corresponds to Red, G to Green, B to Blue, W to White)
|
||||
* --------- ---------
|
||||
* | R | G | | G | R |
|
||||
* |-------| - (tag 1) |-------| - (tag 2)
|
||||
* | B | W | | W | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | B | | B | W |
|
||||
* |-------| - (tag 3) |-------| - (tag 4)
|
||||
* | G | R | | R | G |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | R | B | | G | W |
|
||||
* |-------| - (tag 5) |-------| - (tag 6)
|
||||
* | G | W | | R | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | G | | B | R |
|
||||
* |-------| - (tag 7) |-------| - (tag 8)
|
||||
* | B | R | | W | G |
|
||||
* --------- ---------
|
||||
*
|
||||
*
|
||||
* Each image contains an EXIF field with an orientation tag (0x112).
|
||||
* After reading each image and applying the orientation tag,
|
||||
* the resulting image should be:
|
||||
* ---------
|
||||
* | R | G |
|
||||
* |-------|
|
||||
* | B | W |
|
||||
* ---------
|
||||
*
|
||||
* Note:
|
||||
* The flags parameter of the imread function is set as IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH.
|
||||
* Using this combination is an undocumented trick to load images similarly to the IMREAD_UNCHANGED flag,
|
||||
* preserving the alpha channel (if present) while also applying the orientation.
|
||||
*/
|
||||
|
||||
typedef testing::TestWithParam<string> Exif;
|
||||
|
||||
TEST_P(Exif, exif_orientation)
|
||||
{
|
||||
const string root = cvtest::TS::ptr()->get_data_path();
|
||||
const string filename = root + GetParam();
|
||||
const int colorThresholdHigh = 250;
|
||||
const int colorThresholdLow = 5;
|
||||
|
||||
// Refer to the note in the explanation above.
|
||||
Mat m_img = imread(filename, IMREAD_COLOR | IMREAD_ANYCOLOR | IMREAD_ANYDEPTH);
|
||||
ASSERT_FALSE(m_img.empty());
|
||||
|
||||
if (m_img.channels() == 3)
|
||||
{
|
||||
Vec3b vec;
|
||||
|
||||
//Checking the first quadrant (with supposed red)
|
||||
vec = m_img.at<Vec3b>(2, 2); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[2], colorThresholdHigh);
|
||||
|
||||
//Checking the second quadrant (with supposed green)
|
||||
vec = m_img.at<Vec3b>(2, 7); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[1], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
|
||||
//Checking the third quadrant (with supposed blue)
|
||||
vec = m_img.at<Vec3b>(7, 2); //some point inside the square
|
||||
EXPECT_GE(vec.val[0], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
}
|
||||
else
|
||||
{
|
||||
Vec4b vec;
|
||||
|
||||
//Checking the first quadrant (with supposed red)
|
||||
vec = m_img.at<Vec4b>(2, 2); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[2], colorThresholdHigh);
|
||||
|
||||
//Checking the second quadrant (with supposed green)
|
||||
vec = m_img.at<Vec4b>(2, 7); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[1], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
|
||||
//Checking the third quadrant (with supposed blue)
|
||||
vec = m_img.at<Vec4b>(7, 2); //some point inside the square
|
||||
EXPECT_GE(vec.val[0], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
}
|
||||
}
|
||||
|
||||
const string exif_files[] =
|
||||
{
|
||||
#ifdef HAVE_JPEG
|
||||
"readwrite/testExifOrientation_1.jpg",
|
||||
"readwrite/testExifOrientation_2.jpg",
|
||||
"readwrite/testExifOrientation_3.jpg",
|
||||
"readwrite/testExifOrientation_4.jpg",
|
||||
"readwrite/testExifOrientation_5.jpg",
|
||||
"readwrite/testExifOrientation_6.jpg",
|
||||
"readwrite/testExifOrientation_7.jpg",
|
||||
"readwrite/testExifOrientation_8.jpg",
|
||||
#endif
|
||||
#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
|
||||
"readwrite/testExifOrientation_1.png",
|
||||
"readwrite/testExifOrientation_2.png",
|
||||
"readwrite/testExifOrientation_3.png",
|
||||
"readwrite/testExifOrientation_4.png",
|
||||
"readwrite/testExifOrientation_5.png",
|
||||
"readwrite/testExifOrientation_6.png",
|
||||
"readwrite/testExifOrientation_7.png",
|
||||
"readwrite/testExifOrientation_8.png",
|
||||
#endif
|
||||
#ifdef HAVE_AVIF
|
||||
"readwrite/testExifOrientation_1.avif",
|
||||
"readwrite/testExifOrientation_2.avif",
|
||||
"readwrite/testExifOrientation_3.avif",
|
||||
"readwrite/testExifOrientation_4.avif",
|
||||
"readwrite/testExifOrientation_5.avif",
|
||||
"readwrite/testExifOrientation_6.avif",
|
||||
"readwrite/testExifOrientation_7.avif",
|
||||
"readwrite/testExifOrientation_8.avif",
|
||||
#endif
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Imgcodecs, Exif,
|
||||
testing::ValuesIn(exif_files));
|
||||
|
||||
}
|
||||
}
|
@ -11,95 +11,6 @@ extern "C" {
|
||||
#include "jpeglib.h"
|
||||
}
|
||||
|
||||
/**
|
||||
* Test for check whether reading exif orientation tag was processed successfully or not
|
||||
* The test info is the set of 8 images named testExifRotate_{1 to 8}.jpg
|
||||
* The test image is the square 10x10 points divided by four sub-squares:
|
||||
* (R corresponds to Red, G to Green, B to Blue, W to white)
|
||||
* --------- ---------
|
||||
* | R | G | | G | R |
|
||||
* |-------| - (tag 1) |-------| - (tag 2)
|
||||
* | B | W | | W | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | B | | B | W |
|
||||
* |-------| - (tag 3) |-------| - (tag 4)
|
||||
* | G | R | | R | G |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | R | B | | G | W |
|
||||
* |-------| - (tag 5) |-------| - (tag 6)
|
||||
* | G | W | | R | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | G | | B | R |
|
||||
* |-------| - (tag 7) |-------| - (tag 8)
|
||||
* | B | R | | W | G |
|
||||
* --------- ---------
|
||||
*
|
||||
*
|
||||
* Every image contains exif field with orientation tag (0x112)
|
||||
* After reading each image the corresponding matrix must be read as
|
||||
* ---------
|
||||
* | R | G |
|
||||
* |-------|
|
||||
* | B | W |
|
||||
* ---------
|
||||
*
|
||||
*/
|
||||
|
||||
typedef testing::TestWithParam<string> Imgcodecs_Jpeg_Exif;
|
||||
|
||||
TEST_P(Imgcodecs_Jpeg_Exif, exif_orientation)
|
||||
{
|
||||
const string root = cvtest::TS::ptr()->get_data_path();
|
||||
const string filename = root + GetParam();
|
||||
const int colorThresholdHigh = 250;
|
||||
const int colorThresholdLow = 5;
|
||||
|
||||
Mat m_img = imread(filename);
|
||||
ASSERT_FALSE(m_img.empty());
|
||||
Vec3b vec;
|
||||
|
||||
//Checking the first quadrant (with supposed red)
|
||||
vec = m_img.at<Vec3b>(2, 2); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[2], colorThresholdHigh);
|
||||
|
||||
//Checking the second quadrant (with supposed green)
|
||||
vec = m_img.at<Vec3b>(2, 7); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[1], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
|
||||
//Checking the third quadrant (with supposed blue)
|
||||
vec = m_img.at<Vec3b>(7, 2); //some point inside the square
|
||||
EXPECT_GE(vec.val[0], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
}
|
||||
|
||||
const string exif_files[] =
|
||||
{
|
||||
"readwrite/testExifOrientation_1.jpg",
|
||||
"readwrite/testExifOrientation_2.jpg",
|
||||
"readwrite/testExifOrientation_3.jpg",
|
||||
"readwrite/testExifOrientation_4.jpg",
|
||||
"readwrite/testExifOrientation_5.jpg",
|
||||
"readwrite/testExifOrientation_6.jpg",
|
||||
"readwrite/testExifOrientation_7.jpg",
|
||||
"readwrite/testExifOrientation_8.jpg"
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_Jpeg_Exif,
|
||||
testing::ValuesIn(exif_files));
|
||||
|
||||
//==================================================================================================
|
||||
|
||||
TEST(Imgcodecs_Jpeg, encode_empty)
|
||||
{
|
||||
cv::Mat img;
|
||||
|
@ -109,100 +109,6 @@ TEST(Imgcodecs_Png, read_color_palette_with_alpha)
|
||||
EXPECT_EQ(img.at<Vec3b>(0, 1), Vec3b(255, 0, 0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test for check whether reading exif orientation tag was processed successfully or not
|
||||
* The test info is the set of 8 images named testExifRotate_{1 to 8}.png
|
||||
* The test image is the square 10x10 points divided by four sub-squares:
|
||||
* (R corresponds to Red, G to Green, B to Blue, W to white)
|
||||
* --------- ---------
|
||||
* | R | G | | G | R |
|
||||
* |-------| - (tag 1) |-------| - (tag 2)
|
||||
* | B | W | | W | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | B | | B | W |
|
||||
* |-------| - (tag 3) |-------| - (tag 4)
|
||||
* | G | R | | R | G |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | R | B | | G | W |
|
||||
* |-------| - (tag 5) |-------| - (tag 6)
|
||||
* | G | W | | R | B |
|
||||
* --------- ---------
|
||||
*
|
||||
* --------- ---------
|
||||
* | W | G | | B | R |
|
||||
* |-------| - (tag 7) |-------| - (tag 8)
|
||||
* | B | R | | W | G |
|
||||
* --------- ---------
|
||||
*
|
||||
*
|
||||
* Every image contains exif field with orientation tag (0x112)
|
||||
* After reading each image and applying the orientation tag,
|
||||
* the resulting image should be:
|
||||
* ---------
|
||||
* | R | G |
|
||||
* |-------|
|
||||
* | B | W |
|
||||
* ---------
|
||||
*
|
||||
*/
|
||||
|
||||
typedef testing::TestWithParam<string> Imgcodecs_PNG_Exif;
|
||||
|
||||
// Solution to issue 16579: PNG read doesn't support Exif orientation data
|
||||
#ifdef OPENCV_IMGCODECS_PNG_WITH_EXIF
|
||||
TEST_P(Imgcodecs_PNG_Exif, exif_orientation)
|
||||
#else
|
||||
TEST_P(Imgcodecs_PNG_Exif, DISABLED_exif_orientation)
|
||||
#endif
|
||||
{
|
||||
const string root = cvtest::TS::ptr()->get_data_path();
|
||||
const string filename = root + GetParam();
|
||||
const int colorThresholdHigh = 250;
|
||||
const int colorThresholdLow = 5;
|
||||
|
||||
Mat m_img = imread(filename);
|
||||
ASSERT_FALSE(m_img.empty());
|
||||
Vec3b vec;
|
||||
|
||||
//Checking the first quadrant (with supposed red)
|
||||
vec = m_img.at<Vec3b>(2, 2); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[2], colorThresholdHigh);
|
||||
|
||||
//Checking the second quadrant (with supposed green)
|
||||
vec = m_img.at<Vec3b>(2, 7); //some point inside the square
|
||||
EXPECT_LE(vec.val[0], colorThresholdLow);
|
||||
EXPECT_GE(vec.val[1], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
|
||||
//Checking the third quadrant (with supposed blue)
|
||||
vec = m_img.at<Vec3b>(7, 2); //some point inside the square
|
||||
EXPECT_GE(vec.val[0], colorThresholdHigh);
|
||||
EXPECT_LE(vec.val[1], colorThresholdLow);
|
||||
EXPECT_LE(vec.val[2], colorThresholdLow);
|
||||
}
|
||||
|
||||
const string exif_files[] =
|
||||
{
|
||||
"readwrite/testExifOrientation_1.png",
|
||||
"readwrite/testExifOrientation_2.png",
|
||||
"readwrite/testExifOrientation_3.png",
|
||||
"readwrite/testExifOrientation_4.png",
|
||||
"readwrite/testExifOrientation_5.png",
|
||||
"readwrite/testExifOrientation_6.png",
|
||||
"readwrite/testExifOrientation_7.png",
|
||||
"readwrite/testExifOrientation_8.png"
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ExifFiles, Imgcodecs_PNG_Exif,
|
||||
testing::ValuesIn(exif_files));
|
||||
|
||||
|
||||
typedef testing::TestWithParam<string> Imgcodecs_Png_PngSuite;
|
||||
|
||||
TEST_P(Imgcodecs_Png_PngSuite, decode)
|
||||
|
Loading…
Reference in New Issue
Block a user