Mirror of https://github.com/opencv/opencv.git, synced 2025-06-07 09:25:45 +08:00
Merge pull request #22179 from hanliutong:new-rvv
[GSoC] New universal intrinsic backend for RVV

* Add new RVV backend (partially implemented).
* Modify the framework of Universal Intrinsic.
* Add CV_SIMD macro guards to current UI code.
* Use vlanes() instead of nlanes.
* Modify the UI test.
* Enable the new RVV (scalable) backend.
* Remove whitespace.
* Rename and some other changes.
* Update intrin.hpp, but it still does not work on AVX/SSE.
* Update conditional compilation macros.
* Use a static variable for vlanes.
* Use max_nlanes for array definitions.
This commit is contained in:
parent 3c23a44786
commit 0ef803950b
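To orient the reader before the hunks below: the core pattern of this commit is that fixed-width backends expose a compile-time lane count (e.g. v_int16::nlanes), which cannot exist for a scalable ISA such as RVV, so lane counts move behind the runtime query VTraits<T>::vlanes(), and stack arrays use the compile-time upper bound VTraits<T>::max_nlanes. The following helper is a sketch of that pattern, not part of the diff; it assumes a backend where the new VTraits plus the v_mul/vx_load/vx_setall_f32 wrappers introduced in this PR are available.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Scale an array by k using the width-agnostic universal intrinsics.
void scale_array(const float* src, float* dst, int n, float k)
{
    const int step = VTraits<v_float32>::vlanes(); // lane count queried at run time
    const v_float32 vk = vx_setall_f32(k);
    int i = 0;
    for (; i + step <= n; i += step)
        v_store(dst + i, v_mul(vx_load(src + i), vk)); // v_mul wrapper instead of operator*
    for (; i < n; ++i) // scalar tail
        dst[i] = src[i] * k;
}

// Buffers that used to be dimensioned with v_int16::nlanes are now dimensioned
// with the compile-time maximum, e.g. short buf[VTraits<v_int16>::max_nlanes];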
@@ -177,7 +177,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
+    int D = (int)alignSize(maxD - minD, VTraits<v_int16>::vlanes()), width1 = maxX1 - minX1;
     //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
     //That is needs of parallel algorithm
     xrange_min = (xrange_min < 0) ? 0: xrange_min;
@@ -502,8 +502,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     const int D = params.numDisparities;
     int width1 = maxX1 - minX1;
-    int Da = (int)alignSize(D, v_int16::nlanes);
-    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    int Da = (int)alignSize(D,VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
     int npasses = params.isFullDP() ? 2 : 1;
@@ -977,11 +977,10 @@ struct CalcVerticalSums: public ParallelLoopBody
     width = img1.cols;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     D = maxD - minD;
-    Da = (int)alignSize(D, v_int16::nlanes);
-    Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+    Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     width1 = maxX1 - minX1;
     D = params.numDisparities;
-    Da = (int)alignSize(D, v_int16::nlanes);
 }

 void operator()(const Range& range) const CV_OVERRIDE
@@ -1235,8 +1234,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
     INVALID_DISP = minD - 1;
     INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     D = maxD - minD;
-    Da = (int)alignSize(D, v_int16::nlanes);
-    Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+    Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     width1 = maxX1 - minX1;
 }

@@ -1484,8 +1483,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
     int width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
     int width1 = maxX1 - minX1;
-    int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
-    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    int Da = (int)alignSize(params.numDisparities, VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1;
     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

@@ -1630,7 +1629,7 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
     width = img1->cols; height = img1->rows;
     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
     minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
-    Da = (int)alignSize(D, v_int16::nlanes);
+    Da = (int)alignSize(D, VTraits<v_int16>::vlanes());

     SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;

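The stereo SGBM hunks above all follow the same sizing scheme: the disparity range is rounded up to a whole number of v_int16 lanes so the per-pixel cost rows can be processed in full vectors. A hypothetical illustration of that logic (names are mine, not from the diff), using cv::alignSize exactly as the diff does:

#include <opencv2/core/utility.hpp>      // cv::alignSize
#include <opencv2/core/hal/intrin.hpp>   // VTraits, v_int16

// Round the disparity count up to a whole number of v_int16 lanes (Da), then
// reserve one extra vector's worth of lanes (Dlra) so MAX_COST sentinels can
// be stored for d = -1 and d = D, as the comments in the diff explain.
int aligned_cost_row_length(int numDisparities)
{
    using namespace cv;
    const int lanes = VTraits<v_int16>::vlanes();       // power of two at run time
    int Da = (int)alignSize(numDisparities, lanes);
    int Dlra = Da + lanes;
    return Dlra;
}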
@@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 # undef CV_RVV
 #endif

-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -229,9 +229,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
 #include "opencv2/core/hal/intrin_wasm.hpp"

-#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
 #include "opencv2/core/hal/intrin_rvv.hpp"

+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
+#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
 #else

 #include "opencv2/core/hal/intrin_cpp.hpp"
@@ -314,6 +315,14 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD512_FP16 0
 #endif

+#ifndef CV_SIMD_SCALABLE
+#define CV_SIMD_SCALABLE 0
+#endif
+
+#ifndef CV_SIMD_SCALABLE_64F
+#define CV_SIMD_SCALABLE_64F 0
+#endif
+
 //==================================================================================================

 template<typename _Tp> struct V_RegTraits
@@ -375,6 +384,18 @@ template<typename _Tp> struct V_RegTraits
 CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
 CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
 #endif
+#if CV_SIMD_SCALABLE
+CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
+CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
+CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
+CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
+CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
+CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
+CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
+CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
+CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
+CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
+#endif
 //! @endcond

 #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
@@ -488,6 +509,17 @@ namespace CV__SIMD_NAMESPACE {
     #define VXPREFIX(func) v##func
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
+
+#elif CV_SIMD_SCALABLE
+#define CV__SIMD_NAMESPACE simd
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 0
+    #define CV_SIMD_WIDTH 128 /* 1024/8 */
+
+    #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+
 #endif

 namespace CV__SIMD_NAMESPACE {
@@ -663,6 +695,402 @@ namespace CV__SIMD_NAMESPACE {
     /** @brief SIMD processing state cleanup call */
     inline void vx_cleanup() { VXPREFIX(_cleanup)(); }

+#if CV_SIMD
+    // Compatibility layer
+    #define CV_SIMD_SCALABLE 0
+    #define CV_SIMD_SCALABLE_64F 0
+
+    template <class T>
+    struct VTraits;
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 64, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 64, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+#if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+#endif
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 32, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+#if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+#endif
+#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+    template <>
+    struct VTraits<v_uint8>
+    {
+        static inline int vlanes() { return v_uint8::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = uchar;
+    };
+    template <>
+    struct VTraits<v_int8>
+    {
+        static inline int vlanes() { return v_int8::nlanes; }
+        enum { nlanes = 16, max_nlanes = nlanes };
+        using lane_type = schar;
+    };
+    template <>
+    struct VTraits<v_uint16>
+    {
+        static inline int vlanes() { return v_uint16::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = ushort;
+    };
+    template <>
+    struct VTraits<v_int16>
+    {
+        static inline int vlanes() { return v_int16::nlanes; }
+        enum { nlanes = 8, max_nlanes = nlanes };
+        using lane_type = short;
+    };
+    template <>
+    struct VTraits<v_uint32>
+    {
+        static inline int vlanes() { return v_uint32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = uint;
+    };
+    template <>
+    struct VTraits<v_int32>
+    {
+        static inline int vlanes() { return v_int32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = int;
+    };
+
+    template <>
+    struct VTraits<v_float32>
+    {
+        static inline int vlanes() { return v_float32::nlanes; }
+        enum { nlanes = 4, max_nlanes = nlanes };
+        using lane_type = float;
+    };
+    template <>
+    struct VTraits<v_uint64>
+    {
+        static inline int vlanes() { return v_uint64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = uint64;
+    };
+    template <>
+    struct VTraits<v_int64>
+    {
+        static inline int vlanes() { return v_int64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = int64;
+    };
+#if CV_SIMD_64F
+    template <>
+    struct VTraits<v_float64>
+    {
+        static inline int vlanes() { return v_float64::nlanes; }
+        enum { nlanes = 2, max_nlanes = nlanes };
+        using lane_type = double;
+    };
+#endif
+#endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
+    inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a + b; \
+    } \
+    inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a - b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \
+        return v_add(f1 + f2, vf...); \
+    }

+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+#if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+#endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
+    inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a & b; \
+    } \
+    inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a | b; \
+    } \
+    inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a ^ b; \
+    } \
+    inline _Tpvec v_not(const _Tpvec& a) \
+    { \
+        return ~a; \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+
+
+    #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
+    inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a * b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \
+        return v_mul(f1 * f2, vf...); \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+#if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+#endif
+
+
+    inline v_float32 v_div(const v_float32& a, const v_float32& b) \
+    { \
+        return a / b; \
+    }
+#if CV_SIMD_64F
+    inline v_float64 v_div(const v_float64& a, const v_float64& b) \
+    { \
+        return a / b; \
+    }
+#endif
+
+    #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
+    inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a op b; \
+    }
+
+    #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
+
+    OPENCV_HAL_WRAP_CMP(v_uint8)
+    OPENCV_HAL_WRAP_CMP(v_uint16)
+    OPENCV_HAL_WRAP_CMP(v_uint32)
+    // OPENCV_HAL_WRAP_CMP(v_uint64)
+    OPENCV_HAL_WRAP_CMP(v_int8)
+    OPENCV_HAL_WRAP_CMP(v_int16)
+    OPENCV_HAL_WRAP_CMP(v_int32)
+    // OPENCV_HAL_WRAP_CMP(v_int64)
+    OPENCV_HAL_WRAP_CMP(v_float32)
+#if CV_SIMD_64F
+    OPENCV_HAL_WRAP_CMP(v_float64)
+#endif
+
+    //////////// get0 ////////////
+    #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
+    inline _Tp v_get0(v_##_Tpvec v) \
+    { \
+        return v.get0(); \
+    }
+
+    OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
+    OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
+    OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
+    OPENCV_HAL_WRAP_GRT0_INT(int16, short)
+    OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
+    OPENCV_HAL_WRAP_GRT0_INT(int32, int)
+    OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
+    OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
+    OPENCV_HAL_WRAP_GRT0_INT(float32, float)
+#if CV_SIMD_64F
+    OPENCV_HAL_WRAP_GRT0_INT(float64, double)
+#endif
+
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
+    inline _Tp v_extract_highest(_Tpvec v) \
+    { \
+        return v_extract_n<vl-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
+#if CV_SIMD_64F
+    OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
+#endif
+
+    #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
+    inline _Tpvec v_broadcast_highest(_Tpvec v) \
+    { \
+        return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_BROADCAST(v_uint32)
+    OPENCV_HAL_WRAP_BROADCAST(v_int32)
+    OPENCV_HAL_WRAP_BROADCAST(v_float32)
+
+
+#endif //CV_SIMD

 //! @cond IGNORED

modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp (new file, 493 lines)
@@ -0,0 +1,493 @@

#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP

#include <initializer_list>
#include <assert.h>
#include <vector>

#ifndef CV_RVV_MAX_VLEN
#define CV_RVV_MAX_VLEN 1024
#endif

namespace cv
{
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1

using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
using v_uint16 = vuint16m1_t;
using v_int16 = vint16m1_t;
using v_uint32 = vuint32m1_t;
using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;

using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
#endif

using uchar = unsigned char;
using schar = signed char;
using ushort = unsigned short;
using uint = unsigned int;
using uint64 = unsigned long int;
using int64 = long int;

static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1();
static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1();
static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1();
static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1();

template <class T>
struct VTraits;

template <>
struct VTraits<v_uint8>
{
    static inline int vlanes() { return __cv_rvv_e8_nlanes; }
    using lane_type = uchar;
    static const int max_nlanes = CV_RVV_MAX_VLEN/8;
};

template <>
struct VTraits<v_int8>
{
    static inline int vlanes() { return __cv_rvv_e8_nlanes; }
    using lane_type = schar;
    static const int max_nlanes = CV_RVV_MAX_VLEN/8;
};
template <>
struct VTraits<v_uint16>
{
    static inline int vlanes() { return __cv_rvv_e16_nlanes; }
    using lane_type = ushort;
    static const int max_nlanes = CV_RVV_MAX_VLEN/16;
};
template <>
struct VTraits<v_int16>
{
    static inline int vlanes() { return __cv_rvv_e16_nlanes; }
    using lane_type = short;
    static const int max_nlanes = CV_RVV_MAX_VLEN/16;
};
template <>
struct VTraits<v_uint32>
{
    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
    using lane_type = uint;
    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};
template <>
struct VTraits<v_int32>
{
    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
    using lane_type = int;
    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};

template <>
struct VTraits<v_float32>
{
    static inline int vlanes() { return __cv_rvv_e32_nlanes; }
    using lane_type = float;
    static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};
template <>
struct VTraits<v_uint64>
{
    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
    using lane_type = uint64;
    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
template <>
struct VTraits<v_int64>
{
    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
    using lane_type = int64;
    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
#if CV_SIMD_SCALABLE_64F
template <>
struct VTraits<v_float64>
{
    static inline int vlanes() { return __cv_rvv_e64_nlanes; }
    using lane_type = double;
    static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
#endif

//////////// get0 ////////////
#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(v_##_Tpvec v) \
{ \
    return vmv_x(v); \
}

OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)

inline float v_get0(v_float32 v) \
{ \
    return vfmv_f(v); \
}
#if CV_SIMD_SCALABLE_64F
inline double v_get0(v_float64 v) \
{ \
    return vfmv_f(v); \
}
#endif

//////////// Initial ////////////

#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    return vmv_v_x_##suffix2##m1(0, vl); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    return vmv_v_x_##suffix2##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    return vfmv_v_f_##suffix##m1(0, vl); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    return vfmv_v_f_##suffix##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
#endif

//////////// Reinterpret ////////////
#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
{ \
    return v;\
}
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
#endif
// TODO: can be simplified by using overloaded RV intrinsic
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
}

OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)


#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
}

OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
{ \
    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));\
}

inline v_float64 v_reinterpret_as_f64(const v_float32& v) \
{ \
    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));\
}
#endif


////////////// Load/Store //////////////
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
} \
inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
{ \
    assert(nScalars.size() == vl); \
    return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
} \
template<typename... Targs> \
_Tpvec v_load_##suffix(Targs... nScalars) \
{ \
    return v_load({nScalars...}); \
}


OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
#endif

////////////// Lookup table access ////////////////////
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
    vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
    std::vector<uint> idx_; \
    for (size_t i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
        idx_.push_back(idx[i]); \
        idx_.push_back(idx[i]+1); \
    } \
    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
    std::vector<uint> idx_; \
    for (size_t i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
        idx_.push_back(idx[i]); \
        idx_.push_back(idx[i]+1); \
        idx_.push_back(idx[i]+2); \
        idx_.push_back(idx[i]+3); \
    } \
    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)

inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }


////////////// Min/Max //////////////

#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return intrin(a, b, vl); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
#endif


//////////// Value reordering ////////////

#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    return vget_##suffix##m1(temp, 0); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
    return vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
}

OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)

inline v_uint32 v_load_expand_q(const uchar* ptr)
{
    return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
}

inline v_int32 v_load_expand_q(const schar* ptr)
{
    return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
}


////// FP16 support ///////

inline v_float32 v_load_expand(const float16_t* ptr)
{
    // TODO
    return vundefined_f32m1();
}

inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

} //namespace cv

#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP

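A usage sketch for the new header (assumptions: a RISC-V toolchain with the V extension and CV_RVV_SCALABLE defined). The scalable types above are raw sizeless RVV types, so there is no T::nlanes and no v.get0() member; lane counts come from VTraits<T>::vlanes() and the first lane from v_get0(). Only operations actually defined in this partially implemented header (loads, stores, v_setall_s16, v_min) are used below.

#include <algorithm>
#include <opencv2/core/hal/intrin.hpp>

// Clamp every element of src to at most hi, one runtime-sized vector at a time.
void clamp_above(const short* src, short* dst, int n, short hi)
{
    using namespace cv;
    const int step = VTraits<v_int16>::vlanes();  // vsetvlmax_e16m1() under the hood
    const v_int16 vhi = v_setall_s16(hi);
    int i = 0;
    for (; i + step <= n; i += step)
        v_store(dst + i, v_min(v_load(src + i), vhi));
    for (; i < n; ++i)                            // scalar tail
        dst[i] = std::min(src[i], hi);
}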
@@ -128,8 +128,48 @@ template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const

 #endif // SIMD512

+#if CV_SIMD_SCALABLE
+template<typename _T> struct Type2Vec_Traits;
+#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
+template<> struct Type2Vec_Traits<type_> \
+{ \
+    typedef vec_type_ vec_type; \
+}
+
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
+#if CV_SIMD_SCALABLE_64F
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
+#endif
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);
+
+template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec_Traits<  uint>::vec_type v_setall<  uint>(const uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec_Traits<   int>::vec_type v_setall<   int>(const int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
+#if CV_SIMD_SCALABLE_64F
+template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
+#endif


+#if CV_SIMD_SCALABLE
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
-#if CV_SIMD_WIDTH == 16
+#elif CV_SIMD_WIDTH == 16
 template<typename _T> static inline
 typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
 #elif CV_SIMD_WIDTH == 32

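The Type2Vec_Traits machinery above lets generic code pick the vector type from a scalar type, so vx_setall can stay a single template regardless of backend. A minimal illustration (hypothetical helper, assuming the scalable branch where Type2Vec_Traits and the scalable v_store are enabled):

#include <opencv2/core/hal/intrin.hpp>

// Fill dst with a constant: Type2Vec_Traits maps the scalar type T to its
// vector type, vx_setall broadcasts the value, VTraits supplies the stride.
template <typename T>
void fill_vectorized(T* dst, int n, T value)
{
    using VecT = typename cv::Type2Vec_Traits<T>::vec_type;
    const VecT v = cv::vx_setall(value);
    const int step = cv::VTraits<VecT>::vlanes();
    int i = 0;
    for (; i + step <= n; i += step)
        cv::v_store(dst + i, v);
    for (; i < n; ++i)
        dst[i] = value;
}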
@@ -266,24 +266,30 @@ struct op_absdiff
 template<>
 struct op_absdiff<schar, v_int8>
 {
+#if CV_SIMD
     static inline v_int8 r(const v_int8& a, const v_int8& b)
     { return v_absdiffs(a, b); }
+#endif
     static inline schar r(schar a, schar b)
     { return c_absdiff(a, b); }
 };
 template<>
 struct op_absdiff<short, v_int16>
 {
+#if CV_SIMD
     static inline v_int16 r(const v_int16& a, const v_int16& b)
     { return v_absdiffs(a, b); }
+#endif
     static inline short r(short a, short b)
     { return c_absdiff(a, b); }
 };
 template<>
 struct op_absdiff<int, v_int32>
 {
+#if CV_SIMD
     static inline v_int32 r(const v_int32& a, const v_int32& b)
     { return v_reinterpret_as_s32(v_absdiff(a, b)); }
+#endif
     static inline int r(int a, int b)
     { return c_absdiff(a, b); }
 };
@@ -1430,11 +1436,13 @@ struct op_mul
 template<typename T1, typename T2, typename Tvec>
 struct op_mul_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return v_scalar * a * b;
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
     { return c_mul(a, b, *scalar); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1569,6 +1577,7 @@ struct op_div_f
 template<typename T1, typename T2, typename Tvec>
 struct op_div_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1579,6 +1588,7 @@ struct op_div_scale
         const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
         return v_select(denom == v_zero, v_zero, res);
     }
+#endif
     static inline T1 r(T1 a, T1 denom, const T2* scalar)
     {
         CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1589,11 +1599,13 @@ struct op_div_scale
 template<>
 struct op_div_scale<float, float, v_float32>
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return a * v_scalar / b;
     }
+#endif
     static inline float r(float a, float denom, const float* scalar)
     { return c_div(a, denom, *scalar); }
 };
@@ -1673,11 +1685,13 @@ DEFINE_SIMD_ALL(div, div_loop)
 template<typename T1, typename T2, typename Tvec>
 struct op_add_scale
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_alpha = vx_setall_f32(*scalar);
         return v_fma(a, v_alpha, b);
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
     { return c_add(a, b, *scalar); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1704,6 +1718,7 @@ struct op_add_scale<double, double, v_float64>
 template<typename T1, typename T2, typename Tvec>
 struct op_add_weighted
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
     {
         const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@@ -1711,6 +1726,7 @@ struct op_add_weighted
         const v_float32 v_gamma = vx_setall_f32(scalars[2]);
         return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
     }
+#endif
     static inline T1 r(T1 a, T1 b, const T2* scalars)
     { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
     static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1819,6 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
 template<typename T1, typename T2, typename Tvec>
 struct op_recip
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1829,6 +1846,7 @@ struct op_recip
         const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
         return v_select(denom == v_zero, v_zero, res);
     }
+#endif
     static inline T1 r(T1 denom, const T2* scalar)
     {
         CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1839,11 +1857,13 @@ struct op_recip
 template<>
 struct op_recip<float, float, v_float32>
 {
+#if CV_SIMD
     static inline v_float32 r(const v_float32& a, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
         return v_scalar / a;
     }
+#endif
     static inline float r(float denom, const float* scalar)
     { return c_div(*scalar, denom); }
 };

@@ -7,7 +7,7 @@
 #include "opencv2/core/hal/intrin.hpp"
 #undef CV__SIMD_FORCE_WIDTH

-#if CV_SIMD_WIDTH != 16
+#if CV_SIMD && CV_SIMD_WIDTH != 16
 #error "Invalid build configuration"
 #endif

(File diff suppressed because it is too large.)
@@ -6,6 +6,8 @@

 #if !defined(GAPI_STANDALONE)

+#include <opencv2/core/hal/intrin.hpp>
+#if CV_SIMD
 #include "gfluidcore_func.hpp"
 #include "gfluidcore_func.simd.hpp"

@@ -14,7 +16,6 @@
 #include "gfluidutils.hpp"

 #include <opencv2/core/cvdef.h>
-#include <opencv2/core/hal/intrin.hpp>

 #include <cmath>
 #include <cstdlib>
@@ -394,5 +395,5 @@ CONVERTTO_SCALED_SIMD(float, float)
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

+#endif // CV_SIMD
 #endif // !defined(GAPI_STANDALONE)

@@ -6,7 +6,7 @@

 #pragma once

-#if !defined(GAPI_STANDALONE)
+#if !defined(GAPI_STANDALONE) && CV_SIMD

 #include <opencv2/core.hpp>

@@ -3612,6 +3612,7 @@ struct Luv2RGBinteger
         }
     }

+#if CV_SIMD
     inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
                                 v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
     {
@@ -3717,6 +3718,7 @@ struct Luv2RGBinteger
             z[k] = v_max(zero, v_min(base2, z[k]));
         }
     }
+#endif

     void operator()(const uchar* src, uchar* dst, int n) const
     {

@ -1038,6 +1038,7 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
|
||||
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
|
||||
v_int32 (&ruv)[4],
|
||||
v_int32 (&guv)[4],
|
||||
@ -1067,6 +1068,7 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
|
||||
buv[k] = vshift + ub * uu[k];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
|
||||
uchar& r, uchar& g, uchar& b, uchar& a)
|
||||
@ -1079,6 +1081,7 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
|
||||
a = uchar(0xff);
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
static inline void yRGBuvToRGBA(const v_uint8& vy,
|
||||
const v_int32 (&ruv)[4],
|
||||
const v_int32 (&guv)[4],
|
||||
@ -1117,6 +1120,7 @@ static inline void yRGBuvToRGBA(const v_uint8& vy,
|
||||
gg = v_pack_u(g0, g1);
|
||||
bb = v_pack_u(b0, b1);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<int bIdx, int dcn, bool is420>
|
||||
static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
|
||||
@ -1426,6 +1430,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
|
||||
return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
|
||||
{
|
||||
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
|
||||
@ -1455,6 +1460,7 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
|
||||
|
||||
return v_pack(y0, y1);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
|
||||
{
|
||||
@ -1467,6 +1473,7 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
|
||||
v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
|
||||
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
|
||||
{
|
||||
@ -1514,6 +1521,7 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
|
||||
u = v_pack_u(u0, u1);
|
||||
v = v_pack_u(v0, v1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
struct RGB8toYUV420pInvoker: public ParallelLoopBody
|
||||

@ -497,7 +497,6 @@ struct MinMax8u
{
typedef uchar value_type;
typedef int arg_type;
enum { SIZE = 1 };
arg_type load(const uchar* ptr) { return *ptr; }
void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
void operator()(arg_type& a, arg_type& b) const
@ -511,7 +510,6 @@ struct MinMax16u
{
typedef ushort value_type;
typedef int arg_type;
enum { SIZE = 1 };
arg_type load(const ushort* ptr) { return *ptr; }
void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
void operator()(arg_type& a, arg_type& b) const
@ -526,7 +524,6 @@ struct MinMax16s
{
typedef short value_type;
typedef int arg_type;
enum { SIZE = 1 };
arg_type load(const short* ptr) { return *ptr; }
void store(short* ptr, arg_type val) { *ptr = (short)val; }
void operator()(arg_type& a, arg_type& b) const
@ -541,7 +538,6 @@ struct MinMax32f
{
typedef float value_type;
typedef float arg_type;
enum { SIZE = 1 };
arg_type load(const float* ptr) { return *ptr; }
void store(float* ptr, arg_type val) { *ptr = val; }
void operator()(arg_type& a, arg_type& b) const
@ -552,14 +548,13 @@ struct MinMax32f
}
};

#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE

struct MinMaxVec8u
{
typedef uchar value_type;
typedef v_uint8x16 arg_type;
enum { SIZE = v_uint8x16::nlanes };
arg_type load(const uchar* ptr) { return v_load(ptr); }
typedef v_uint8 arg_type;
arg_type load(const uchar* ptr) { return vx_load(ptr); }
void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
@ -567,27 +562,14 @@ struct MinMaxVec8u
a = v_min(a, b);
b = v_max(b, t);
}
#if CV_SIMD_WIDTH > 16
typedef v_uint8 warg_type;
enum { WSIZE = v_uint8::nlanes };
warg_type wload(const uchar* ptr) { return vx_load(ptr); }
void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
};


struct MinMaxVec16u
{
typedef ushort value_type;
typedef v_uint16x8 arg_type;
enum { SIZE = v_uint16x8::nlanes };
arg_type load(const ushort* ptr) { return v_load(ptr); }
typedef v_uint16 arg_type;
arg_type load(const ushort* ptr) { return vx_load(ptr); }
void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
@ -595,27 +577,14 @@ struct MinMaxVec16u
a = v_min(a, b);
b = v_max(b, t);
}
#if CV_SIMD_WIDTH > 16
typedef v_uint16 warg_type;
enum { WSIZE = v_uint16::nlanes };
warg_type wload(const ushort* ptr) { return vx_load(ptr); }
void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
};


struct MinMaxVec16s
{
typedef short value_type;
typedef v_int16x8 arg_type;
enum { SIZE = v_int16x8::nlanes };
arg_type load(const short* ptr) { return v_load(ptr); }
typedef v_int16 arg_type;
arg_type load(const short* ptr) { return vx_load(ptr); }
void store(short* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
@ -623,27 +592,14 @@ struct MinMaxVec16s
a = v_min(a, b);
b = v_max(b, t);
}
#if CV_SIMD_WIDTH > 16
typedef v_int16 warg_type;
enum { WSIZE = v_int16::nlanes };
warg_type wload(const short* ptr) { return vx_load(ptr); }
void store(short* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
};


struct MinMaxVec32f
{
typedef float value_type;
typedef v_float32x4 arg_type;
enum { SIZE = v_float32x4::nlanes };
arg_type load(const float* ptr) { return v_load(ptr); }
typedef v_float32 arg_type;
arg_type load(const float* ptr) { return vx_load(ptr); }
void store(float* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
@ -651,18 +607,6 @@ struct MinMaxVec32f
a = v_min(a, b);
b = v_max(b, t);
}
#if CV_SIMD_WIDTH > 16
typedef v_float32 warg_type;
enum { WSIZE = v_float32::nlanes };
warg_type wload(const float* ptr) { return vx_load(ptr); }
void store(float* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
};

#else
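
The deleted CV_SIMD_WIDTH > 16 blocks duplicated every struct for wider registers; with the vx-width types, vx_load, and VTraits, one code path now serves 128-bit, 256-bit, and scalable registers alike. The operator() each struct keeps is a vector compare-exchange, the primitive of the sorting network below (restated here as a standalone sketch):

using namespace cv;

// After the call, a holds the lane-wise minimum and b the lane-wise maximum.
inline void cmp_exchange(v_uint8& a, v_uint8& b)
{
    v_uint8 t = a;
    a = v_min(a, b);
    b = v_max(b, t);
}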
@ -683,9 +627,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
typedef typename Op::value_type T;
typedef typename Op::arg_type WT;
typedef typename VecOp::arg_type VT;
#if CV_SIMD_WIDTH > 16
typedef typename VecOp::warg_type WVT;
#endif

const T* src = _src.ptr<T>();
T* dst = _dst.ptr<T>();
@ -747,22 +688,12 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width )
break;

#if CV_SIMD_WIDTH > 16
for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE )
{
WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn);
WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn);
WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn);

vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
vop(p4, p2); vop(p6, p4); vop(p4, p2);
vop.store(dst+j, p4);
}
#if CV_SIMD || CV_SIMD_SCALABLE
int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
#else
int nlanes = 1;
#endif
for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
for( ; j <= size.width - nlanes - cn; j += nlanes )
{
VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
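
The loop bound changes from the compile-time VecOp::SIZE to a run-time nlanes because on the scalable RVV backend the lane count is only known at run time. The pattern, sketched with a hypothetical copy_row helper:

using namespace cv;

void copy_row(const uchar* src, uchar* dst, int width)
{
    const int nlanes = VTraits<v_uint8>::vlanes();  // run-time on RVV, constant elsewhere
    int j = 0;
    for (; j <= width - nlanes; j += nlanes)
        v_store(dst + j, vx_load(src + j));
    for (; j < width; ++j)                          // scalar tail
        dst[j] = src[j];
}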

@ -862,79 +793,43 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width )
break;

#if CV_SIMD_WIDTH > 16
for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE )
{
WVT p[25];
for( k = 0; k < 5; k++ )
{
const T* rowk = row[k];
p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn);
p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn);
p[k*5+4] = vop.wload(rowk+j+cn*2);
}

vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
vop.store(dst+j, p[12]);
}
#if CV_SIMD || CV_SIMD_SCALABLE
int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
#else
int nlanes = 1;
#endif
for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
for( ; j <= size.width - nlanes - cn*2; j += nlanes )
{
VT p[25];
for( k = 0; k < 5; k++ )
{
const T* rowk = row[k];
p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
p[k*5+4] = vop.load(rowk+j+cn*2);
}
VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2);
VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1);
VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0);
VT p3 = vop.load(row[0]+j+cn*1), p8 = vop.load(row[1]+j+cn*1), p13 = vop.load(row[2]+j+cn*1), p18 = vop.load(row[3]+j+cn*1), p23 = vop.load(row[4]+j+cn*1);
VT p4 = vop.load(row[0]+j+cn*2), p9 = vop.load(row[1]+j+cn*2), p14 = vop.load(row[2]+j+cn*2), p19 = vop.load(row[3]+j+cn*2), p24 = vop.load(row[4]+j+cn*2);

vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
vop.store(dst+j, p[12]);
vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4);
vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4);
vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8);
vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11);
vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6);
vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8);
vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5);
vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8);
vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17);
vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15);
vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19);
vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24);
vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22);
vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18);
vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16);
vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19);
vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16);
vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12);
vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16);
vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10);
vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17);
vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19);
vop(p7, p11); vop(p11, p13); vop(p11, p12);
vop.store(dst+j, p12);
}

limit = size.width;
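
Note how VT p[25] becomes twenty-five named variables p0..p24: scalable vector types are sizeless, so they presumably cannot be array elements, while plain element buffers can still be sized with the compile-time upper bound max_nlanes (the commit message's "Use max_nlanes for array defining"). A sketch of that buffer pattern (hypothetical spill helper, not this patch's code):

using namespace cv;

void spill_one_vector(const short* src, short* dst)
{
    short buf[VTraits<v_int16>::max_nlanes];  // worst-case lane count, compile-time
    v_int16 v = vx_load(src);
    v_store(buf, v);
    for (int i = 0; i < VTraits<v_int16>::vlanes(); ++i)  // actual lanes, run-time
        dst[i] = buf[i];
}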

@ -22,6 +22,10 @@ set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w

set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
OPTION(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON) # Enabled by default
IF(RISCV_RVV_SCALABLE)
ADD_DEFINITIONS(-DCV_RVV_SCALABLE)
ENDIF()

set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
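
The toolchain file now exposes RISCV_RVV_SCALABLE (ON by default), which defines CV_RVV_SCALABLE and selects the scalable intrinsics model. A hedged sketch of how the resulting macros are expected to look from C++:

#include <opencv2/core/hal/intrin.hpp>

// Report which intrinsics model the build ended up with.
const char* simd_model()
{
#if CV_SIMD_SCALABLE
    return "scalable RVV: lane count queried at run time";
#elif CV_SIMD
    return "fixed-width universal intrinsics";
#else
    return "scalar fallback";
#endif
}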