Merge pull request #22179 from hanliutong:new-rvv

[GSoC] New universal intrinsic backend for RVV

* Add new rvv backend (partially implemented).

* Modify the framework of Universal Intrinsic.

* Add CV_SIMD macro guards to current UI code.

* Use vlanes() instead of nlanes (see the sketch below the changed-files summary).

* Modify the UI test.

* Enable the new RVV (scalable) backend.

* Remove whitespace.

* Rename and make other minor modifications.

* Update intrin.hpp, but it still does not work on AVX/SSE.

* Update conditional compilation macros (see the dispatch sketch below the changed-files summary).

* Use static variable for vlanes.

* Use max_nlanes for defining array sizes.
HAN Liutong 2022-07-20 01:02:00 +08:00 committed by GitHub
parent 3c23a44786
commit 0ef803950b
13 changed files with 1484 additions and 358 deletions
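
The "vlanes() instead of nlanes" and "max_nlanes for defining array sizes" items come down to the pattern sketched below. This is an illustrative sketch only, not code from the patch: the function and variable names are made up, and it assumes a build where either CV_SIMD or the new CV_SIMD_SCALABLE is enabled.

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// Hypothetical kernel: dst[i] = src[i] + src[i], written against the new VTraits API.
static void double_elements(const short* src, short* dst, int len)
{
    // The lane count is now a run-time query; on the scalable RVV backend it is only
    // known after vsetvlmax, so it can no longer be the enum constant v_int16::nlanes.
    const int step = VTraits<v_int16>::vlanes();

    int x = 0;
    for (; x + step <= len; x += step)
        v_store(dst + x, v_add(vx_load(src + x), vx_load(src + x)));

    // Stack buffers still need a compile-time size, so they are dimensioned with the
    // upper bound max_nlanes (e.g. CV_RVV_MAX_VLEN/16 for 16-bit lanes on RVV).
    if (x < len)
    {
        short buf[VTraits<v_int16>::max_nlanes] = {0};
        for (int i = 0; x + i < len; ++i)
            buf[i] = src[x + i];
        v_store(buf, v_add(vx_load(buf), vx_load(buf)));
        for (int i = 0; x + i < len; ++i)
            dst[x + i] = buf[i];
    }
}

The same loop previously used v_int16::nlanes both as the step and as the array size; with a scalable vector length those two roles have to be separated into vlanes() (run time) and max_nlanes (compile-time upper bound).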
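Several items ("Add CV_SIMD macro guards to current UI code", "Update conditional compilation macros") reflect that the scalable backend keeps CV_SIMD defined as 0, so fixed-width-only code paths must now be compiled out explicitly. A hedged sketch of the dispatch pattern, not code from the diff:

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

int float32_lane_count()
{
#if CV_SIMD || CV_SIMD_SCALABLE
    // Available on both fixed-width (SSE/AVX/NEON/...) and scalable (RVV) builds,
    // as long as the code sticks to the VTraits/wrapper-function API.
    return VTraits<v_float32>::vlanes();
#else
    return 1;   // scalar fallback: no universal-intrinsic backend at all
#endif
}

In the diff itself, kernels that still rely on operator overloads or fixed-width vector types (for example the op_* functors in the arithmetic loops) are fenced with plain #if CV_SIMD, while code already ported to the wrapper API, such as the median-blur MinMaxVec helpers, uses #if CV_SIMD || CV_SIMD_SCALABLE.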


@@ -177,7 +177,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
+int D = (int)alignSize(maxD - minD, VTraits<v_int16>::vlanes()), width1 = maxX1 - minX1;
//This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
//That is needs of parallel algorithm
xrange_min = (xrange_min < 0) ? 0: xrange_min;
@@ -502,8 +502,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
const int D = params.numDisparities;
int width1 = maxX1 - minX1;
-int Da = (int)alignSize(D, v_int16::nlanes);
+int Da = (int)alignSize(D,VTraits<v_int16>::vlanes());
-int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
int npasses = params.isFullDP() ? 2 : 1;
@@ -977,11 +977,10 @@ struct CalcVerticalSums: public ParallelLoopBody
width = img1.cols;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
D = maxD - minD;
-Da = (int)alignSize(D, v_int16::nlanes);
+Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
-Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
D = params.numDisparities;
-Da = (int)alignSize(D, v_int16::nlanes);
}
void operator()(const Range& range) const CV_OVERRIDE
@@ -1235,8 +1234,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
INVALID_DISP = minD - 1;
INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
D = maxD - minD;
-Da = (int)alignSize(D, v_int16::nlanes);
+Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
-Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
}
@@ -1484,8 +1483,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
int width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int width1 = maxX1 - minX1;
-int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
+int Da = (int)alignSize(params.numDisparities, VTraits<v_int16>::vlanes());
-int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1;
int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
@@ -1630,7 +1629,7 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
width = img1->cols; height = img1->rows;
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
-Da = (int)alignSize(D, v_int16::nlanes);
+Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;


@@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_RVV
#endif
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
@@ -229,9 +229,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
-#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
@@ -314,6 +315,14 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD512_FP16 0
#endif
#ifndef CV_SIMD_SCALABLE
#define CV_SIMD_SCALABLE 0
#endif
#ifndef CV_SIMD_SCALABLE_64F
#define CV_SIMD_SCALABLE_64F 0
#endif
//==================================================================================================
template<typename _Tp> struct V_RegTraits
@@ -375,6 +384,18 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
#if CV_SIMD_SCALABLE
CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
#endif
//! @endcond
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
@@ -488,6 +509,17 @@ namespace CV__SIMD_NAMESPACE {
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD_SCALABLE
#define CV__SIMD_NAMESPACE simd
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 0
#define CV_SIMD_WIDTH 128 /* 1024/8 */
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#endif
namespace CV__SIMD_NAMESPACE {
@@ -663,6 +695,402 @@ namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if CV_SIMD
// Compatibility layer
#define CV_SIMD_SCALABLE 0
#define CV_SIMD_SCALABLE_64F 0
template <class T>
struct VTraits;
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
template <>
struct VTraits<v_uint8>
{
static inline int vlanes() { return v_uint8::nlanes; }
enum { nlanes = 64, max_nlanes = nlanes };
using lane_type = uchar;
};
template <>
struct VTraits<v_int8>
{
static inline int vlanes() { return v_int8::nlanes; }
enum { nlanes = 64, max_nlanes = nlanes };
using lane_type = schar;
};
template <>
struct VTraits<v_uint16>
{
static inline int vlanes() { return v_uint16::nlanes; }
enum { nlanes = 32, max_nlanes = nlanes };
using lane_type = ushort;
};
template <>
struct VTraits<v_int16>
{
static inline int vlanes() { return v_int16::nlanes; }
enum { nlanes = 32, max_nlanes = nlanes };
using lane_type = short;
};
template <>
struct VTraits<v_uint32>
{
static inline int vlanes() { return v_uint32::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = uint;
};
template <>
struct VTraits<v_int32>
{
static inline int vlanes() { return v_int32::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = int;
};
template <>
struct VTraits<v_float32>
{
static inline int vlanes() { return v_float32::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = float;
};
template <>
struct VTraits<v_uint64>
{
static inline int vlanes() { return v_uint64::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = uint64;
};
template <>
struct VTraits<v_int64>
{
static inline int vlanes() { return v_int64::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = int64;
};
#if CV_SIMD_64F
template <>
struct VTraits<v_float64>
{
static inline int vlanes() { return v_float64::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = double;
};
#endif
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
template <>
struct VTraits<v_uint8>
{
static inline int vlanes() { return v_uint8::nlanes; }
enum { nlanes = 32, max_nlanes = nlanes };
using lane_type = uchar;
};
template <>
struct VTraits<v_int8>
{
static inline int vlanes() { return v_int8::nlanes; }
enum { nlanes = 32, max_nlanes = nlanes };
using lane_type = schar;
};
template <>
struct VTraits<v_uint16>
{
static inline int vlanes() { return v_uint16::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = ushort;
};
template <>
struct VTraits<v_int16>
{
static inline int vlanes() { return v_int16::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = short;
};
template <>
struct VTraits<v_uint32>
{
static inline int vlanes() { return v_uint32::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = uint;
};
template <>
struct VTraits<v_int32>
{
static inline int vlanes() { return v_int32::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = int;
};
template <>
struct VTraits<v_float32>
{
static inline int vlanes() { return v_float32::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = float;
};
template <>
struct VTraits<v_uint64>
{
static inline int vlanes() { return v_uint64::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = uint64;
};
template <>
struct VTraits<v_int64>
{
static inline int vlanes() { return v_int64::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = int64;
};
#if CV_SIMD_64F
template <>
struct VTraits<v_float64>
{
static inline int vlanes() { return v_float64::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = double;
};
#endif
#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
template <>
struct VTraits<v_uint8>
{
static inline int vlanes() { return v_uint8::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = uchar;
};
template <>
struct VTraits<v_int8>
{
static inline int vlanes() { return v_int8::nlanes; }
enum { nlanes = 16, max_nlanes = nlanes };
using lane_type = schar;
};
template <>
struct VTraits<v_uint16>
{
static inline int vlanes() { return v_uint16::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = ushort;
};
template <>
struct VTraits<v_int16>
{
static inline int vlanes() { return v_int16::nlanes; }
enum { nlanes = 8, max_nlanes = nlanes };
using lane_type = short;
};
template <>
struct VTraits<v_uint32>
{
static inline int vlanes() { return v_uint32::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = uint;
};
template <>
struct VTraits<v_int32>
{
static inline int vlanes() { return v_int32::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = int;
};
template <>
struct VTraits<v_float32>
{
static inline int vlanes() { return v_float32::nlanes; }
enum { nlanes = 4, max_nlanes = nlanes };
using lane_type = float;
};
template <>
struct VTraits<v_uint64>
{
static inline int vlanes() { return v_uint64::nlanes; }
enum { nlanes = 2, max_nlanes = nlanes };
using lane_type = uint64;
};
template <>
struct VTraits<v_int64>
{
static inline int vlanes() { return v_int64::nlanes; }
enum { nlanes = 2, max_nlanes = nlanes };
using lane_type = int64;
};
#if CV_SIMD_64F
template <>
struct VTraits<v_float64>
{
static inline int vlanes() { return v_float64::nlanes; }
enum { nlanes = 2, max_nlanes = nlanes };
using lane_type = double;
};
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
{ \
return a + b; \
} \
inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
{ \
return a - b; \
} \
template<typename... Args> \
inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \
return v_add(f1 + f2, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
return a & b; \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
return a | b; \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
return a ^ b; \
} \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
return ~a; \
}
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
return a * b; \
} \
template<typename... Args> \
inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \
return v_mul(f1 * f2, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
inline v_float32 v_div(const v_float32& a, const v_float32& b) \
{ \
return a / b; \
}
#if CV_SIMD_64F
inline v_float64 v_div(const v_float64& a, const v_float64& b) \
{ \
return a / b; \
}
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(v_##_Tpvec v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
OPENCV_HAL_WRAP_GRT0_INT(int16, short)
OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
OPENCV_HAL_WRAP_GRT0_INT(int32, int)
OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
OPENCV_HAL_WRAP_GRT0_INT(float32, float)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0_INT(float64, double)
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(_Tpvec v) \
{ \
return v_extract_n<vl-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(_Tpvec v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#endif //CV_SIMD
//! @cond IGNORED


@@ -0,0 +1,493 @@
#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
#include <initializer_list>
#include <assert.h>
#include <vector>
#ifndef CV_RVV_MAX_VLEN
#define CV_RVV_MAX_VLEN 1024
#endif
namespace cv
{
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1
using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
using v_uint16 = vuint16m1_t;
using v_int16 = vint16m1_t;
using v_uint32 = vuint32m1_t;
using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;
using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
#endif
using uchar = unsigned char;
using schar = signed char;
using ushort = unsigned short;
using uint = unsigned int;
using uint64 = unsigned long int;
using int64 = long int;
static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1();
static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1();
static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1();
static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1();
template <class T>
struct VTraits;
template <>
struct VTraits<v_uint8>
{
static inline int vlanes() { return __cv_rvv_e8_nlanes; }
using lane_type = uchar;
static const int max_nlanes = CV_RVV_MAX_VLEN/8;
};
template <>
struct VTraits<v_int8>
{
static inline int vlanes() { return __cv_rvv_e8_nlanes; }
using lane_type = schar;
static const int max_nlanes = CV_RVV_MAX_VLEN/8;
};
template <>
struct VTraits<v_uint16>
{
static inline int vlanes() { return __cv_rvv_e16_nlanes; }
using lane_type = ushort;
static const int max_nlanes = CV_RVV_MAX_VLEN/16;
};
template <>
struct VTraits<v_int16>
{
static inline int vlanes() { return __cv_rvv_e16_nlanes; }
using lane_type = short;
static const int max_nlanes = CV_RVV_MAX_VLEN/16;
};
template <>
struct VTraits<v_uint32>
{
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
using lane_type = uint;
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};
template <>
struct VTraits<v_int32>
{
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
using lane_type = int;
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};
template <>
struct VTraits<v_float32>
{
static inline int vlanes() { return __cv_rvv_e32_nlanes; }
using lane_type = float;
static const int max_nlanes = CV_RVV_MAX_VLEN/32;
};
template <>
struct VTraits<v_uint64>
{
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
using lane_type = uint64;
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
template <>
struct VTraits<v_int64>
{
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
using lane_type = int64;
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
#if CV_SIMD_SCALABLE_64F
template <>
struct VTraits<v_float64>
{
static inline int vlanes() { return __cv_rvv_e64_nlanes; }
using lane_type = double;
static const int max_nlanes = CV_RVV_MAX_VLEN/64;
};
#endif
//////////// get0 ////////////
#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(v_##_Tpvec v) \
{ \
return vmv_x(v); \
}
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
inline float v_get0(v_float32 v) \
{ \
return vfmv_f(v); \
}
#if CV_SIMD_SCALABLE_64F
inline double v_get0(v_float64 v) \
{ \
return vfmv_f(v); \
}
#endif
//////////// Initial ////////////
#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
return vmv_v_x_##suffix2##m1(0, vl); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
return vmv_v_x_##suffix2##m1(v, vl); \
}
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
return vfmv_v_f_##suffix##m1(0, vl); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
return vfmv_v_f_##suffix##m1(v, vl); \
}
OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
#endif
//////////// Reinterpret ////////////
#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
{ \
return v;\
}
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
#endif
// TODO: can be simplified by using overloaded RV intrinsic
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
}
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)
#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
}
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
{ \
return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));\
}
inline v_float64 v_reinterpret_as_f64(const v_float32& v) \
{ \
return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));\
}
#endif
////////////// Load/Store //////////////
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
return vle##width##_v_##suffix##m1(ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
vse##width(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
vse##width(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
vse##width(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
vse##width(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
} \
inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
{ \
assert(nScalars.size() == vl); \
return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
} \
template<typename... Targs> \
_Tpvec v_load_##suffix(Targs... nScalars) \
{ \
return v_load({nScalars...}); \
}
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
#endif
////////////// Lookup table access ////////////////////
#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
std::vector<uint> idx_; \
for (size_t i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
idx_.push_back(idx[i]); \
idx_.push_back(idx[i]+1); \
} \
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
std::vector<uint> idx_; \
for (size_t i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
idx_.push_back(idx[i]); \
idx_.push_back(idx[i]+1); \
idx_.push_back(idx[i]+2); \
idx_.push_back(idx[i]+3); \
} \
vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
////////////// Min/Max //////////////
#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
return intrin(a, b, vl); \
}
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
#endif
//////////// Value reordering ////////////
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
_Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
b0 = vget_##suffix##m1(temp, 0); \
b1 = vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
_Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
return vget_##suffix##m1(temp, 0); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
_Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
return vget_##suffix##m1(temp, 1); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
}
OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
inline v_uint32 v_load_expand_q(const uchar* ptr)
{
return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
}
inline v_int32 v_load_expand_q(const schar* ptr)
{
return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
}
////// FP16 support ///////
inline v_float32 v_load_expand(const float16_t* ptr)
{
// TODO
return vundefined_f32m1();
}
inline void v_cleanup() {}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
} //namespace cv
#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP


@@ -128,8 +128,48 @@ template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const
#endif // SIMD512
#if CV_SIMD_SCALABLE
template<typename _T> struct Type2Vec_Traits;
#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
template<> struct Type2Vec_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
-#if CV_SIMD_WIDTH == 16
CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
#if CV_SIMD_SCALABLE_64F
CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
#endif
template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);
template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD_SCALABLE_64F
template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif
#endif
#if CV_SIMD_SCALABLE
template<typename _T> static inline
typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 16
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 32


@@ -266,24 +266,30 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
static inline schar r(schar a, schar b)
{ return c_absdiff(a, b); }
};
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
static inline short r(short a, short b)
{ return c_absdiff(a, b); }
};
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
static inline int r(int a, int b)
{ return c_absdiff(a, b); }
};
@@ -1430,11 +1436,13 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar * a * b;
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
{ return c_mul(a, b, *scalar); }
static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1569,6 +1577,7 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1579,6 +1588,7 @@ struct op_div_scale
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
}
#endif
static inline T1 r(T1 a, T1 denom, const T2* scalar)
{
CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1589,11 +1599,13 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
}
#endif
static inline float r(float a, float denom, const float* scalar)
{ return c_div(a, denom, *scalar); }
};
@@ -1673,11 +1685,13 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
return v_fma(a, v_alpha, b);
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
{ return c_add(a, b, *scalar); }
static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1704,6 +1718,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@@ -1711,6 +1726,7 @@ struct op_add_weighted
const v_float32 v_gamma = vx_setall_f32(scalars[2]);
return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalars)
{ return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
static inline Tvec pre(const Tvec&, const Tvec& res)
@@ -1819,6 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@@ -1829,6 +1846,7 @@ struct op_recip
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
}
#endif
static inline T1 r(T1 denom, const T2* scalar)
{
CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
@@ -1839,11 +1857,13 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
}
#endif
static inline float r(float denom, const float* scalar)
{ return c_div(*scalar, denom); }
};


@@ -7,7 +7,7 @@
#include "opencv2/core/hal/intrin.hpp"
#undef CV__SIMD_FORCE_WIDTH
-#if CV_SIMD_WIDTH != 16
+#if CV_SIMD && CV_SIMD_WIDTH != 16
#error "Invalid build configuration"
#endif

File diff suppressed because it is too large


@@ -6,6 +6,8 @@
#if !defined(GAPI_STANDALONE)
#include <opencv2/core/hal/intrin.hpp>
#if CV_SIMD
#include "gfluidcore_func.hpp"
#include "gfluidcore_func.simd.hpp"
@@ -14,7 +16,6 @@
#include "gfluidutils.hpp"
#include <opencv2/core/cvdef.h>
-#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
#include <cstdlib>
@@ -394,5 +395,5 @@ CONVERTTO_SCALED_SIMD(float, float)
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // CV_SIMD
#endif // !defined(GAPI_STANDALONE)


@@ -6,7 +6,7 @@
#pragma once
-#if !defined(GAPI_STANDALONE)
+#if !defined(GAPI_STANDALONE) && CV_SIMD
#include <opencv2/core.hpp>


@@ -3612,6 +3612,7 @@ struct Luv2RGBinteger
}
}
#if CV_SIMD
inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
{
@@ -3717,6 +3718,7 @@ struct Luv2RGBinteger
z[k] = v_max(zero, v_min(base2, z[k]));
}
}
#endif
void operator()(const uchar* src, uchar* dst, int n) const
{


@@ -1038,6 +1038,7 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}
#if CV_SIMD
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4],
v_int32 (&guv)[4],
@@ -1067,6 +1068,7 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
buv[k] = vshift + ub * uu[k];
}
}
#endif
static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
uchar& r, uchar& g, uchar& b, uchar& a)
@@ -1079,6 +1081,7 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
a = uchar(0xff);
}
#if CV_SIMD
static inline void yRGBuvToRGBA(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
@@ -1117,6 +1120,7 @@ static inline void yRGBuvToRGBA(const v_uint8& vy,
gg = v_pack_u(g0, g1);
bb = v_pack_u(b0, b1);
}
#endif
template<int bIdx, int dcn, bool is420>
static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
@@ -1426,6 +1430,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
}
#if CV_SIMD
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
{
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
@@ -1455,6 +1460,7 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
return v_pack(y0, y1);
}
#endif
static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
{
@@ -1467,6 +1473,7 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
}
#if CV_SIMD
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
{
@@ -1514,6 +1521,7 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
}
#endif
struct RGB8toYUV420pInvoker: public ParallelLoopBody


@@ -497,7 +497,6 @@ struct MinMax8u
{
typedef uchar value_type;
typedef int arg_type;
-enum { SIZE = 1 };
arg_type load(const uchar* ptr) { return *ptr; }
void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
void operator()(arg_type& a, arg_type& b) const
@@ -511,7 +510,6 @@ struct MinMax16u
{
typedef ushort value_type;
typedef int arg_type;
-enum { SIZE = 1 };
arg_type load(const ushort* ptr) { return *ptr; }
void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
void operator()(arg_type& a, arg_type& b) const
@@ -526,7 +524,6 @@ struct MinMax16s
{
typedef short value_type;
typedef int arg_type;
-enum { SIZE = 1 };
arg_type load(const short* ptr) { return *ptr; }
void store(short* ptr, arg_type val) { *ptr = (short)val; }
void operator()(arg_type& a, arg_type& b) const
@@ -541,7 +538,6 @@ struct MinMax32f
{
typedef float value_type;
typedef float arg_type;
-enum { SIZE = 1 };
arg_type load(const float* ptr) { return *ptr; }
void store(float* ptr, arg_type val) { *ptr = val; }
void operator()(arg_type& a, arg_type& b) const
@ -552,14 +548,13 @@ struct MinMax32f
} }
}; };
#if CV_SIMD #if CV_SIMD || CV_SIMD_SCALABLE
struct MinMaxVec8u struct MinMaxVec8u
{ {
typedef uchar value_type; typedef uchar value_type;
typedef v_uint8x16 arg_type; typedef v_uint8 arg_type;
enum { SIZE = v_uint8x16::nlanes }; arg_type load(const uchar* ptr) { return vx_load(ptr); }
arg_type load(const uchar* ptr) { return v_load(ptr); }
void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); } void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const void operator()(arg_type& a, arg_type& b) const
{ {
@ -567,27 +562,14 @@ struct MinMaxVec8u
a = v_min(a, b); a = v_min(a, b);
b = v_max(b, t); b = v_max(b, t);
} }
#if CV_SIMD_WIDTH > 16
typedef v_uint8 warg_type;
enum { WSIZE = v_uint8::nlanes };
warg_type wload(const uchar* ptr) { return vx_load(ptr); }
void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
}; };
struct MinMaxVec16u struct MinMaxVec16u
{ {
typedef ushort value_type; typedef ushort value_type;
typedef v_uint16x8 arg_type; typedef v_uint16 arg_type;
enum { SIZE = v_uint16x8::nlanes }; arg_type load(const ushort* ptr) { return vx_load(ptr); }
arg_type load(const ushort* ptr) { return v_load(ptr); }
void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); } void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const void operator()(arg_type& a, arg_type& b) const
{ {
@ -595,27 +577,14 @@ struct MinMaxVec16u
a = v_min(a, b); a = v_min(a, b);
b = v_max(b, t); b = v_max(b, t);
} }
#if CV_SIMD_WIDTH > 16
typedef v_uint16 warg_type;
enum { WSIZE = v_uint16::nlanes };
warg_type wload(const ushort* ptr) { return vx_load(ptr); }
void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
}; };
struct MinMaxVec16s struct MinMaxVec16s
{ {
typedef short value_type; typedef short value_type;
typedef v_int16x8 arg_type; typedef v_int16 arg_type;
enum { SIZE = v_int16x8::nlanes }; arg_type load(const short* ptr) { return vx_load(ptr); }
arg_type load(const short* ptr) { return v_load(ptr); }
void store(short* ptr, const arg_type &val) { v_store(ptr, val); } void store(short* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const void operator()(arg_type& a, arg_type& b) const
{ {
@ -623,27 +592,14 @@ struct MinMaxVec16s
a = v_min(a, b); a = v_min(a, b);
b = v_max(b, t); b = v_max(b, t);
} }
#if CV_SIMD_WIDTH > 16
typedef v_int16 warg_type;
enum { WSIZE = v_int16::nlanes };
warg_type wload(const short* ptr) { return vx_load(ptr); }
void store(short* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
}; };
struct MinMaxVec32f struct MinMaxVec32f
{ {
typedef float value_type; typedef float value_type;
typedef v_float32x4 arg_type; typedef v_float32 arg_type;
enum { SIZE = v_float32x4::nlanes }; arg_type load(const float* ptr) { return vx_load(ptr); }
arg_type load(const float* ptr) { return v_load(ptr); }
void store(float* ptr, const arg_type &val) { v_store(ptr, val); } void store(float* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const void operator()(arg_type& a, arg_type& b) const
{ {
@ -651,18 +607,6 @@ struct MinMaxVec32f
a = v_min(a, b); a = v_min(a, b);
b = v_max(b, t); b = v_max(b, t);
} }
#if CV_SIMD_WIDTH > 16
typedef v_float32 warg_type;
enum { WSIZE = v_float32::nlanes };
warg_type wload(const float* ptr) { return vx_load(ptr); }
void store(float* ptr, const warg_type &val) { v_store(ptr, val); }
void operator()(warg_type& a, warg_type& b) const
{
warg_type t = a;
a = v_min(a, b);
b = v_max(b, t);
}
#endif
}; };
#else #else
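The fixed-width types (v_uint8x16, v_uint16x8, ...) and their compile-time SIZE/WSIZE enums are replaced by the width-agnostic aliases (v_uint8, ...) plus a run-time lane count from VTraits, which also removes the need for the duplicated CV_SIMD_WIDTH > 16 branches. A minimal sketch of how such a vector op is driven, assuming a hypothetical helper (not the patch itself); the same loop works for SSE/AVX and for scalable RVV:

    #include <algorithm>
    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void elementwise_minmax(const uchar* a, const uchar* b, uchar* lo, uchar* hi, int n)
    {
        int j = 0;
    #if CV_SIMD || CV_SIMD_SCALABLE
        const int nlanes = VTraits<v_uint8>::vlanes();  // run-time lane count
        for( ; j <= n - nlanes; j += nlanes )
        {
            v_uint8 x = vx_load(a + j), y = vx_load(b + j);
            v_store(lo + j, v_min(x, y));
            v_store(hi + j, v_max(x, y));
        }
    #endif
        for( ; j < n; j++ )  // scalar tail
        {
            lo[j] = std::min(a[j], b[j]);
            hi[j] = std::max(a[j], b[j]);
        }
    }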
@ -683,9 +627,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
typedef typename Op::value_type T; typedef typename Op::value_type T;
typedef typename Op::arg_type WT; typedef typename Op::arg_type WT;
typedef typename VecOp::arg_type VT; typedef typename VecOp::arg_type VT;
#if CV_SIMD_WIDTH > 16
typedef typename VecOp::warg_type WVT;
#endif
const T* src = _src.ptr<T>(); const T* src = _src.ptr<T>();
T* dst = _dst.ptr<T>(); T* dst = _dst.ptr<T>();
@ -747,22 +688,12 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width ) if( limit == size.width )
break; break;
#if CV_SIMD_WIDTH > 16 #if CV_SIMD || CV_SIMD_SCALABLE
for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE ) int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
{ #else
WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn); int nlanes = 1;
WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn);
WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn);
vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
vop(p4, p2); vop(p6, p4); vop(p4, p2);
vop.store(dst+j, p4);
}
#endif #endif
for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE ) for( ; j <= size.width - nlanes - cn; j += nlanes )
{ {
VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
@ -862,79 +793,43 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
if( limit == size.width ) if( limit == size.width )
break; break;
#if CV_SIMD_WIDTH > 16 #if CV_SIMD || CV_SIMD_SCALABLE
for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE ) int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
{ #else
WVT p[25]; int nlanes = 1;
for( k = 0; k < 5; k++ )
{
const T* rowk = row[k];
p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn);
p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn);
p[k*5+4] = vop.wload(rowk+j+cn*2);
}
vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
vop.store(dst+j, p[12]);
}
#endif #endif
for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE ) for( ; j <= size.width - nlanes - cn*2; j += nlanes )
{ {
VT p[25]; VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2);
for( k = 0; k < 5; k++ ) VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1);
{ VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0);
const T* rowk = row[k]; VT p3 = vop.load(row[0]+j+cn*1), p8 = vop.load(row[1]+j+cn*1), p13 = vop.load(row[2]+j+cn*1), p18 = vop.load(row[3]+j+cn*1), p23 = vop.load(row[4]+j+cn*1);
p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn); VT p4 = vop.load(row[0]+j+cn*2), p9 = vop.load(row[1]+j+cn*2), p14 = vop.load(row[2]+j+cn*2), p19 = vop.load(row[3]+j+cn*2), p24 = vop.load(row[4]+j+cn*2);
p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
p[k*5+4] = vop.load(rowk+j+cn*2);
}
vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4);
vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4);
vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8);
vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11);
vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6);
vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8);
vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5);
vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8);
vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17);
vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15);
vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19);
vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24);
vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22);
vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18);
vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16);
vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19);
vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16);
vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12);
vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16);
vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10);
vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17);
vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19);
vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); vop(p7, p11); vop(p11, p13); vop(p11, p12);
vop.store(dst+j, p[12]); vop.store(dst+j, p12);
} }
limit = size.width; limit = size.width;
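Note that the 5x5 kernel now keeps its working set in 25 named variables rather than the former p[25] array, presumably because scalable vector types (such as the RVV ones) are sizeless and cannot be C array elements, so the sorting network is spelled out on individual registers. A tiny sketch of the same min/max building block on named lane-groups (hypothetical helper, not part of the patch):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    #if CV_SIMD || CV_SIMD_SCALABLE
    // median of three lane-groups via a min/max network, the primitive the
    // 3x3 and 5x5 kernels compose: median = max(min(a,b), min(max(a,b), c))
    static inline v_uint8 median3(const v_uint8& a, const v_uint8& b, const v_uint8& c)
    {
        return v_max(v_min(a, b), v_min(v_max(a, b), c));
    }
    #endif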

View File

@ -22,6 +22,10 @@ set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
OPTION(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON) # Enabled by default
IF(RISCV_RVV_SCALABLE)
ADD_DEFINITIONS(-DCV_RVV_SCALABLE)
ENDIF()
set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT}) set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
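With this toolchain change, RISCV_RVV_SCALABLE defaults to ON and defines CV_RVV_SCALABLE, which selects the scalable universal-intrinsic backend; configuring with -DRISCV_RVV_SCALABLE=OFF should keep the previous fixed-size RVV intrinsics instead.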