// Mirror of https://github.com/opencv/opencv.git (synced 2025-06-09 02:23:23 +08:00) -- 128 lines, 3.9 KiB, C++.
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
#ifndef OPENCV_FAST_CONVOLUTION_HPP
|
|
#define OPENCV_FAST_CONVOLUTION_HPP
|
|
|
|
#include "opencv2/core/hal/intrin.hpp"
|
|
|
|
#ifndef CONV_PRAM
#define CONV_PRAM

// Target-dependent blocking constants for the fast convolution code.
// NOTE(review): CONV_MR / CONV_NR are presumably the two dimensions of the
// register tile used by the convolution kernel -- confirm against the
// implementation files that consume them.

// CONV_MR has the same value on every supported target, so it is defined once.
#define CONV_MR 4

#if CV_NEON && CV_NEON_AARCH64  // 32 registers.
#define CONV_NR 28
#elif CV_NEON                   // 16 registers.
#define CONV_NR 12
#else                           // SIMD 128, AVX or AVX2
#define CONV_NR 24
#endif

// FAST_VEC_NLANES is 8 only on non-NEON builds with CV_TRY_AVX2 set;
// both NEON variants and the plain SIMD 128 build use 4.
#if !CV_NEON && CV_TRY_AVX2
enum { FAST_VEC_NLANES=8 }; // AVX2
#else
enum { FAST_VEC_NLANES=4 }; // NEON or SIMD 128
#endif

#endif // CONV_PRAM
|
|
|
|
// Compile-time constants for the Winograd code path.
// SIZE and AREA are derived from STEP and KSIZE; the block/atom sizes depend
// on which SIMD back-end the build enables.
enum {
    _FX_WINO_STEP  = 6,
    _FX_WINO_KSIZE = 3,
    _FX_WINO_SIZE  = _FX_WINO_STEP + _FX_WINO_KSIZE - 1,  // 6 + 3 - 1 = 8
    _FX_WINO_AREA  = _FX_WINO_SIZE * _FX_WINO_SIZE,       // 8 * 8 = 64

    // KBLOCK is 4 on every target; only IBLOCK varies with the back-end.
    _FX_WINO_KBLOCK = 4,
#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
    _FX_WINO_IBLOCK = 6,
#else
    _FX_WINO_IBLOCK = 3,
#endif

#if CV_TRY_AVX2
    _FX_WINO_ATOM_F32 = 8,
#else
    _FX_WINO_ATOM_F32 = 4,
#endif

    // AREA / ATOM_F32: 64 / 8 = 8 for AVX2, otherwise 64 / 4 = 16.
    _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32,
};
|
|
|
|
// Identifiers for the available convolution implementations.
// NOTE(review): presumably the values stored in FastConv2d::conv_type -- confirm.
enum
{
    _FX_CONV_TYPE_GENERIC     = 0,
    _FX_CONV_TYPE_DEPTHWISE   = 1,
    _FX_CONV_TYPE_WINOGRAD3X3 = 2
};
|
|
|
|
namespace cv {
|
|
namespace dnn {
|
|
|
|
struct FastConv2d
|
|
{
|
|
int ngroups;
|
|
int K, C, Hk, Wk;
|
|
int stride_y, stride_x;
|
|
int dilation_y, dilation_x;
|
|
int pad_top, pad_bottom, pad_left, pad_right;
|
|
|
|
std::vector<float> weightsBuf; // For generic Conv 2D
|
|
float* weightsBufPtr;
|
|
std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3).
|
|
float* weightsWinoBufPtr;
|
|
std::vector<float> biasBuf;
|
|
int conv_type;
|
|
#if CV_SIMD128
|
|
bool useSIMD128 = true;
|
|
#else
|
|
bool useSIMD128 = false;
|
|
#endif
|
|
bool useAVX2 = checkHardwareSupport(CPU_AVX2);
|
|
bool useNEON = checkHardwareSupport(CPU_NEON);
|
|
};
|
|
|
|
// return a FastConv2d instance.
|
|
Ptr<FastConv2d> initFastConv2d(
|
|
int ngroups,
|
|
int K, int C, int Hk, int Wk,
|
|
int stride_x, int stride_y,
|
|
int dilation_x, int dilation_y,
|
|
const std::vector<size_t>& pads_begin,
|
|
const std::vector<size_t>& pads_end,
|
|
InputArray weightsMat,
|
|
float* srcBias, bool useWinograd);
|
|
|
|
// It contains different computing branches, like winograd, 1x1 conv.
|
|
void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
|
|
const Ptr<ActivationLayer>& actLayer, bool fusedAdd);
|
|
|
|
// Depthwise convolution entry point; takes the same input/output/conv
// arguments as runFastConv2d plus explicit activation parameters.
//
// NOTE(review): `minval` / `maxval` are presumably clamp bounds that apply
// when `ifMinMaxAct` is true, and `activ` an optional activation layer
// applied otherwise -- confirm against the implementation.
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
                  ActivationLayer* activ, bool ifMinMaxAct);
|
|
|
|
// Winograd convolution entry point. Besides the input/output/conv arguments
// it accepts an additional `_fusedAddMat` input array.
//
// NOTE(review): the "63" suffix presumably refers to Winograd F(6x6, 3x3), as
// mentioned on FastConv2d::weightsWinoBuf; `_fusedAddMat` is presumably added
// to the result, and `minval` / `maxval` / `activ` / `ifMinMaxAct` presumably
// describe the fused activation -- confirm against the implementation.
void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
|
|
|
|
} // namespace dnn
|
|
|
|
// AVX2-specific kernels. The declarations are visible only when the build
// sets CV_TRY_AVX2; otherwise the namespace is empty.
namespace opt_AVX2
{
#if CV_TRY_AVX2

// Block kernel of the generic convolution.
// NOTE(review): `a` / `b` look like packed inputs, `c` the output with leading
// dimension `ldc`, and `init_c` a flag choosing between overwrite and
// accumulate -- confirm against the implementation.
void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c);

// Block kernel of the depthwise convolution.
// NOTE(review): `ofstab` / `yxtab` are presumably precomputed offset and
// coordinate tables, and the `inner_*` arguments presumably bound the region
// that needs no border handling -- confirm against the implementation.
void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                         float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                         int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                         int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3);

// Winograd helpers.
// NOTE(review): judging by the names these are the accumulation step and the
// 8x8 input (BtXB) and output (AtXA) transforms -- confirm against the
// implementation.
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock);
void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg);
void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                               float bias, float minval, float maxval, bool ifMinMaxAct);

#endif
} // namespace opt_AVX2
|
|
|
|
} // namespace cv
|
|
|
|
#endif //OPENCV_FAST_CONVOLUTION_HPP
|