Merge pull request #21777 from anna-khakimova:ak/convertto_simd

GAPI Fluid: SIMD for ConvertTo.

* GAPI Fluid: SIMD for convertto.

* Applied comments
This commit is contained in:
Anna Khakimova 2022-03-30 00:14:01 +03:00 committed by GitHub
parent e4abf6e723
commit be38d4ea93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 487 additions and 104 deletions

View File

@ -324,7 +324,7 @@ INSTANTIATE_TEST_CASE_P(ConvertToPerfTestFluid, ConvertToPerfTest,
Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
Values(CV_8U, CV_16U, CV_16S, CV_32F),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(2.5, 1.0),
Values(1.0, 2.5),
Values(0.0),
Values(cv::compile_args(CORE_FLUID))));

View File

@ -1555,102 +1555,43 @@ GAPI_FLUID_KERNEL(GFluidLUT, cv::gapi::core::GLUT, false)
//
//-------------------------
#if CV_SIMD128
template<typename DST, typename SRC>
CV_ALWAYS_INLINE int run_convertto_simd(DST*, const SRC*, int)
template<typename T>
CV_ALWAYS_INLINE void convertto_impl(const T in[], T out[], const int length)
{
return 0;
memcpy(out, in, length * sizeof(T));
}
CV_ALWAYS_INLINE int run_convertto_simd(uchar *out, const float *in, const int length)
template<typename SRC, typename DST>
CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length)
{
int l = 0;
for (; l <= length - 16; l += 16)
{
v_int32x4 i0, i1, i2, i3;
i0 = v_round( v_load( (float*)& in[l ] ) );
i1 = v_round( v_load( (float*)& in[l + 4] ) );
i2 = v_round( v_load( (float*)& in[l + 8] ) );
i3 = v_round( v_load( (float*)& in[l + 12] ) );
v_uint16x8 us0, us1;
us0 = v_pack_u(i0, i1);
us1 = v_pack_u(i2, i3);
v_uint8x16 uc;
uc = v_pack(us0, us1);
v_store((uchar*)& out[l], uc);
}
return l;
}
CV_ALWAYS_INLINE int run_convertto_simd(ushort *out, const float *in, const int length)
{
int l = 0;
for (; l <= length - 8; l += 8)
{
v_int32x4 i0, i1;
i0 = v_round( v_load( (float*)& in[l ] ) );
i1 = v_round( v_load( (float*)& in[l + 4] ) );
v_uint16x8 us;
us = v_pack_u(i0, i1);
v_store((ushort*)& out[l], us);
}
return l;
}
#endif
template<typename DST, typename SRC,
cv::util::enable_if_t<std::is_integral<DST>::value &&
std::is_floating_point<SRC>::value, bool> = true >
CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
{
// manual SIMD if need rounding
static_assert(std::is_same<SRC,float>::value, "64-bit floating-point source is not supported");
int l = 0; // cycle index
#if CV_SIMD128
l = run_convertto_simd(out, in, length);
int x = 0;
#if CV_SIMD
x = convertto_simd(in, out, length);
#endif
// tail of SIMD cycle
for (; l < length; l++)
for (; x < length; ++x)
{
out[l] = saturate<DST>(in[l], rintf);
out[x] = saturate<DST>(in[x], rintf);
}
}
template<typename DST, typename SRC,
cv::util::enable_if_t<std::is_integral<DST>::value &&
std::is_integral<SRC>::value , bool> = true >
CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
template<typename SRC, typename DST>
CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha, const float beta,
const int length)
{
for (int l = 0; l < length; l++)
int x = 0;
#if CV_SIMD
x = convertto_scaled_simd(in, out, alpha, beta, length);
#endif
for (; x < length; ++x)
{
out[l] = saturate<DST>(in[l]);
}
}
template<typename DST, typename SRC,
cv::util::enable_if_t<std::is_floating_point<DST>::value, bool> = true >
CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
{
static_assert(!std::is_same<SRC,double>::value, "64-bit floating-point source is not supported");
for (int l = 0; l < length; l++)
{
out[l] = static_cast<DST>(in[l]);
out[x] = saturate<DST>(in[x] * alpha + beta, rintf);
}
}
template<typename DST, typename SRC>
CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const float alpha, const float beta,
const int length)
{
static_assert(!std::is_same<SRC,double>::value, "64-bit floating-point source is not supported");
// TODO: optimize if alpha and beta and data are integral
for (int l = 0; l < length; l++)
{
out[l] = saturate<DST>(in[l] * alpha + beta, rintf);
}
}
template<typename DST, typename SRC>
static void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
CV_ALWAYS_INLINE void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
{
const auto *in = src.InLine<SRC>(0);
auto *out = dst.OutLine<DST>();
@ -1664,13 +1605,13 @@ static void run_convertto(Buffer &dst, const View &src, double _alpha, double _b
const auto beta = static_cast<float>( _beta );
// compute faster if no alpha no beta
if (1.f == alpha && 0.f == beta)
if ((std::fabs(alpha - 1.f) < FLT_EPSILON) && (std::fabs(beta) < FLT_EPSILON))
{
run_convertto(out, in, length);
convertto_impl(in, out, length);
}
else // if alpha or beta is non-trivial
{
run_convertto(out, in, alpha, beta, length);
convertto_impl(in, out, alpha, beta, length);
}
}
@ -1681,22 +1622,22 @@ GAPI_FLUID_KERNEL(GFluidConvertTo, cv::gapi::core::GConvertTo, false)
static void run(const View &src, int /*rtype*/, double alpha, double beta, Buffer &dst)
{
// DST SRC OP __VA_ARGS__
UNARY_(uchar , uchar , run_convertto, dst, src, alpha, beta);
UNARY_(uchar , ushort, run_convertto, dst, src, alpha, beta);
UNARY_(uchar , short, run_convertto, dst, src, alpha, beta);
UNARY_(uchar , float, run_convertto, dst, src, alpha, beta);
UNARY_(uchar, uchar , run_convertto, dst, src, alpha, beta);
UNARY_(uchar, ushort, run_convertto, dst, src, alpha, beta);
UNARY_(uchar, short, run_convertto, dst, src, alpha, beta);
UNARY_(uchar, float, run_convertto, dst, src, alpha, beta);
UNARY_(ushort, uchar , run_convertto, dst, src, alpha, beta);
UNARY_(ushort, ushort, run_convertto, dst, src, alpha, beta);
UNARY_(ushort, short, run_convertto, dst, src, alpha, beta);
UNARY_(ushort, float, run_convertto, dst, src, alpha, beta);
UNARY_( short, uchar , run_convertto, dst, src, alpha, beta);
UNARY_( short, ushort, run_convertto, dst, src, alpha, beta);
UNARY_( short, short, run_convertto, dst, src, alpha, beta);
UNARY_( short, float, run_convertto, dst, src, alpha, beta);
UNARY_( float, uchar , run_convertto, dst, src, alpha, beta);
UNARY_( float, ushort, run_convertto, dst, src, alpha, beta);
UNARY_( float, short, run_convertto, dst, src, alpha, beta);
UNARY_( float, float, run_convertto, dst, src, alpha, beta);
UNARY_(short, uchar , run_convertto, dst, src, alpha, beta);
UNARY_(short, ushort, run_convertto, dst, src, alpha, beta);
UNARY_(short, short, run_convertto, dst, src, alpha, beta);
UNARY_(short, float, run_convertto, dst, src, alpha, beta);
UNARY_(float, uchar , run_convertto, dst, src, alpha, beta);
UNARY_(float, ushort, run_convertto, dst, src, alpha, beta);
UNARY_(float, short, run_convertto, dst, src, alpha, beta);
UNARY_(float, float, run_convertto, dst, src, alpha, beta);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}

View File

@ -293,9 +293,8 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
#define ADD_SIMD(SRC, DST) \
int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \
{ \
\
CV_CPU_DISPATCH(add_simd, (in1, in2, out, length), \
CV_CPU_DISPATCH_MODES_ALL); \
CV_CPU_DISPATCH(add_simd, (in1, in2, out, length), \
CV_CPU_DISPATCH_MODES_ALL); \
}
ADD_SIMD(uchar, uchar)
@ -320,9 +319,8 @@ ADD_SIMD(float, float)
#define SUB_SIMD(SRC, DST) \
int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \
{ \
\
CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length), \
CV_CPU_DISPATCH_MODES_ALL); \
CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length), \
CV_CPU_DISPATCH_MODES_ALL); \
}
SUB_SIMD(uchar, uchar)
@ -344,6 +342,55 @@ SUB_SIMD(float, float)
#undef SUB_SIMD
// Dispatcher for the no-scaling ConvertTo SIMD kernel: forwards to the best
// CPU-specific convertto_simd() implementation chosen at runtime by
// CV_CPU_DISPATCH (the dispatched call supplies the return statement).
// Presumably the return value is the number of elements the SIMD path
// handled, with the caller finishing the remainder in scalar code.
#define CONVERTTO_NOCOEF_SIMD(SRC, DST) \
int convertto_simd(const SRC in[], DST out[], const int length) \
{ \
CV_CPU_DISPATCH(convertto_simd, (in, out, length), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// One dispatcher per supported (SRC, DST) combination.
CONVERTTO_NOCOEF_SIMD(ushort, uchar)
CONVERTTO_NOCOEF_SIMD(short, uchar)
CONVERTTO_NOCOEF_SIMD(float, uchar)
CONVERTTO_NOCOEF_SIMD(ushort, short)
CONVERTTO_NOCOEF_SIMD(uchar, short)
CONVERTTO_NOCOEF_SIMD(float, short)
CONVERTTO_NOCOEF_SIMD(uchar, ushort)
CONVERTTO_NOCOEF_SIMD(short, ushort)
CONVERTTO_NOCOEF_SIMD(float, ushort)
CONVERTTO_NOCOEF_SIMD(uchar, float)
CONVERTTO_NOCOEF_SIMD(ushort, float)
CONVERTTO_NOCOEF_SIMD(short, float)
#undef CONVERTTO_NOCOEF_SIMD
// Dispatcher for the scaled ConvertTo SIMD kernel (out = in * alpha + beta):
// forwards to the runtime-selected convertto_scaled_simd() implementation via
// CV_CPU_DISPATCH (the dispatched call supplies the return statement).
#define CONVERTTO_SCALED_SIMD(SRC, DST) \
int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length) \
{ \
CV_CPU_DISPATCH(convertto_scaled_simd, (in, out, alpha, beta, length), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// One dispatcher per supported (SRC, DST) combination, including same-type
// pairs (the scale/shift still has to be applied).
CONVERTTO_SCALED_SIMD(uchar, uchar)
CONVERTTO_SCALED_SIMD(ushort, uchar)
CONVERTTO_SCALED_SIMD(short, uchar)
CONVERTTO_SCALED_SIMD(float, uchar)
CONVERTTO_SCALED_SIMD(short, short)
CONVERTTO_SCALED_SIMD(ushort, short)
CONVERTTO_SCALED_SIMD(uchar, short)
CONVERTTO_SCALED_SIMD(float, short)
CONVERTTO_SCALED_SIMD(ushort, ushort)
CONVERTTO_SCALED_SIMD(uchar, ushort)
CONVERTTO_SCALED_SIMD(short, ushort)
CONVERTTO_SCALED_SIMD(float, ushort)
CONVERTTO_SCALED_SIMD(uchar, float)
CONVERTTO_SCALED_SIMD(ushort, float)
CONVERTTO_SCALED_SIMD(short, float)
CONVERTTO_SCALED_SIMD(float, float)
#undef CONVERTTO_SCALED_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -266,6 +266,47 @@ SUB_SIMD(float, float)
#undef SUB_SIMD
// Declarations of the no-scaling ConvertTo SIMD kernels: convert `length`
// elements of SRC into DST. Presumably returns how many elements the SIMD
// path processed so the caller can finish the tail with scalar code.
#define CONVERTTO_NOCOEF_SIMD(SRC, DST) \
int convertto_simd(const SRC in[], DST out[], const int length);
CONVERTTO_NOCOEF_SIMD(ushort, uchar)
CONVERTTO_NOCOEF_SIMD(short, uchar)
CONVERTTO_NOCOEF_SIMD(float, uchar)
CONVERTTO_NOCOEF_SIMD(ushort, short)
CONVERTTO_NOCOEF_SIMD(uchar, short)
CONVERTTO_NOCOEF_SIMD(float, short)
CONVERTTO_NOCOEF_SIMD(uchar, ushort)
CONVERTTO_NOCOEF_SIMD(short, ushort)
CONVERTTO_NOCOEF_SIMD(float, ushort)
CONVERTTO_NOCOEF_SIMD(uchar, float)
CONVERTTO_NOCOEF_SIMD(ushort, float)
CONVERTTO_NOCOEF_SIMD(short, float)
#undef CONVERTTO_NOCOEF_SIMD
// Declarations of the scaled ConvertTo SIMD kernels
// (out = saturate<DST>(in * alpha + beta)); same return convention as the
// no-scaling variant above.
#define CONVERTTO_SCALED_SIMD(SRC, DST) \
int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length);
CONVERTTO_SCALED_SIMD(uchar, uchar)
CONVERTTO_SCALED_SIMD(ushort, uchar)
CONVERTTO_SCALED_SIMD(short, uchar)
CONVERTTO_SCALED_SIMD(float, uchar)
CONVERTTO_SCALED_SIMD(short, short)
CONVERTTO_SCALED_SIMD(ushort, short)
CONVERTTO_SCALED_SIMD(uchar, short)
CONVERTTO_SCALED_SIMD(float, short)
CONVERTTO_SCALED_SIMD(ushort, ushort)
CONVERTTO_SCALED_SIMD(uchar, ushort)
CONVERTTO_SCALED_SIMD(short, ushort)
CONVERTTO_SCALED_SIMD(float, ushort)
CONVERTTO_SCALED_SIMD(uchar, float)
CONVERTTO_SCALED_SIMD(ushort, float)
CONVERTTO_SCALED_SIMD(short, float)
CONVERTTO_SCALED_SIMD(float, float)
#undef CONVERTTO_SCALED_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -275,6 +275,47 @@ SUB_SIMD(float, float)
#undef SUB_SIMD
// Forward declarations for the no-scaling ConvertTo SIMD kernels (one per
// supported SRC/DST pair). Presumably returns the number of elements handled
// by the vector path; the scalar tail is done by the caller.
#define CONVERTTO_NOCOEF_SIMD(SRC, DST) \
int convertto_simd(const SRC in[], DST out[], const int length);
CONVERTTO_NOCOEF_SIMD(ushort, uchar)
CONVERTTO_NOCOEF_SIMD(short, uchar)
CONVERTTO_NOCOEF_SIMD(float, uchar)
CONVERTTO_NOCOEF_SIMD(ushort, short)
CONVERTTO_NOCOEF_SIMD(uchar, short)
CONVERTTO_NOCOEF_SIMD(float, short)
CONVERTTO_NOCOEF_SIMD(uchar, ushort)
CONVERTTO_NOCOEF_SIMD(short, ushort)
CONVERTTO_NOCOEF_SIMD(float, ushort)
CONVERTTO_NOCOEF_SIMD(uchar, float)
CONVERTTO_NOCOEF_SIMD(ushort, float)
CONVERTTO_NOCOEF_SIMD(short, float)
#undef CONVERTTO_NOCOEF_SIMD
// Forward declarations for the scaled ConvertTo SIMD kernels
// (out = saturate<DST>(in * alpha + beta)), one per supported SRC/DST pair.
#define CONVERTTO_SCALED_SIMD(SRC, DST) \
int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length);
CONVERTTO_SCALED_SIMD(uchar, uchar)
CONVERTTO_SCALED_SIMD(ushort, uchar)
CONVERTTO_SCALED_SIMD(short, uchar)
CONVERTTO_SCALED_SIMD(float, uchar)
CONVERTTO_SCALED_SIMD(short, short)
CONVERTTO_SCALED_SIMD(ushort, short)
CONVERTTO_SCALED_SIMD(uchar, short)
CONVERTTO_SCALED_SIMD(float, short)
CONVERTTO_SCALED_SIMD(ushort, ushort)
CONVERTTO_SCALED_SIMD(uchar, ushort)
CONVERTTO_SCALED_SIMD(short, ushort)
CONVERTTO_SCALED_SIMD(float, ushort)
CONVERTTO_SCALED_SIMD(uchar, float)
CONVERTTO_SCALED_SIMD(ushort, float)
CONVERTTO_SCALED_SIMD(short, float)
CONVERTTO_SCALED_SIMD(float, float)
#undef CONVERTTO_SCALED_SIMD
int split3_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], const int width);
@ -289,6 +330,11 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// Type predicates used to constrain the 16-bit conversion overloads below.
// Each expansion is fully parenthesized so the macro always behaves as a
// single boolean expression regardless of the surrounding operators
// (standard macro hygiene; the unparenthesized forms relied on || binding).
// SRC (resp. DST) is (u)short:
#define SRC_SHORT_OR_USHORT (std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value)
#define DST_SHORT_OR_USHORT (std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
// Signed/unsigned 16-bit cross conversion (short<->ushort):
#define SRC_DST_SHORT_AND_USHORT ((std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) || (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value))
// Same-type 16-bit conversion (short->short or ushort->ushort):
#define SRC_DST_SHORT_OR_USHORT ((std::is_same<SRC, short>::value && std::is_same<DST, short>::value) || (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value))
struct scale_tag {};
struct not_scale_tag {};
@ -2778,6 +2824,314 @@ SUB_SIMD(float, float)
#undef SUB_SIMD
//-------------------------
//
// Fluid kernels: ConvertTo
//
//-------------------------
// Store one u16 vector to a ushort destination (types already match).
CV_ALWAYS_INLINE void store_i16(ushort* outx, const v_uint16& res)
{
vx_store(outx, res);
}
// Store one u16 vector to a short destination: bitwise reinterpret to s16
// (no value conversion, just a type-level cast for the store).
CV_ALWAYS_INLINE void store_i16(short* outx, const v_uint16& res)
{
vx_store(outx, v_reinterpret_as_s16(res));
}
// Store one s16 vector to a ushort destination: bitwise reinterpret to u16
// (no value conversion, just a type-level cast for the store).
CV_ALWAYS_INLINE void store_i16(ushort* outx, const v_int16& res)
{
vx_store(outx, v_reinterpret_as_u16(res));
}
// Store one s16 vector to a short destination (types already match).
CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res)
{
vx_store(outx, res);
}
// No-coefficient conversion of one v_uint8-wide chunk: float -> uchar.
// Rounds four float vectors to int32 (v_round), narrows pairwise to int16
// (v_pack), then to uchar with unsigned saturation (v_pack_u), and stores
// one full uchar vector.
CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx)
{
constexpr int nlanes = v_uint8::nlanes;
v_int32 a1 = v_round(vx_load(inx));
v_int32 a2 = v_round(vx_load(&inx[nlanes/4]));
v_int32 a3 = v_round(vx_load(&inx[nlanes/2]));
v_int32 a4 = v_round(vx_load(&inx[3*nlanes/4]));
v_int16 r1 = v_pack(a1, a2);
v_int16 r2 = v_pack(a3, a4);
vx_store(outx, v_pack_u(r1, r2));
}
// No-coefficient conversion of one v_uint8-wide chunk: short/ushort -> uchar.
// Loads two 16-bit vectors and narrows them to uchar with saturation via the
// pack_store_uchar() helper (defined elsewhere in this file).
template<typename SRC>
CV_ALWAYS_INLINE
typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx)
{
constexpr int nlanes = v_uint8::nlanes;
vector_type_of_t<SRC> a1 = vx_load(inx);
vector_type_of_t<SRC> a2 = vx_load(&inx[nlanes/2]);
pack_store_uchar(outx, a1, a2);
}
//---------------------------------------------------------------------------------------
// No-coefficient conversion of one 16-bit-vector-wide chunk:
// float -> short/ushort. Rounds two float vectors to int32 and narrows them
// with saturation via the v_store_i16() helper (defined elsewhere).
template<typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
convertto_simd_nocoeff_impl(const float* inx, DST* outx)
{
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
v_int32 a1 = v_round(vx_load(inx));
v_int32 a2 = v_round(vx_load(&inx[nlanes/2]));
v_store_i16(outx, a1, a2);
}
// No-coefficient conversion of one 16-bit-vector-wide chunk:
// uchar -> short/ushort. vx_load_expand() widens exactly one 16-bit vector's
// worth of uchar values to u16, matching what the chunk consumes.
// (The previous vx_load + v_expand_low form loaded a full uchar vector and
// discarded its top half, over-reading half a vector past the chunk at the
// end of a row; the scaled uchar->16-bit overload already uses
// vx_load_expand for exactly this reason.)
template<typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
convertto_simd_nocoeff_impl(const uchar* inx, DST* outx)
{
    v_uint16 res = vx_load_expand(inx);
    store_i16(outx, res);
}
// No-coefficient 16-bit cross conversion (short <-> ushort) of one
// vector-wide chunk: a straight load plus a reinterpreting store via
// store_i16() — bit patterns are preserved, matching saturate-free
// same-width semantics of the scalar path. TODO confirm saturation intent
// for out-of-range values (e.g. negative short -> ushort).
template<typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<SRC_DST_SHORT_AND_USHORT, void>::type
convertto_simd_nocoeff_impl(const SRC* inx, DST* outx)
{
vector_type_of_t<SRC> a = vx_load(inx);
store_i16(outx, a);
}
//---------------------------------------------------------------------------------------
// No-coefficient conversion of one v_float32-wide chunk: any integral SRC
// (or float) -> float, via the generic vg_load_f32() load-and-convert helper.
template<typename SRC>
CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
{
v_float32 a = vg_load_f32(inx);
vx_store(outx, a);
}
#define CONVERTTO_NOCOEF_SIMD(SRC, DST) \
int convertto_simd(const SRC in[], DST out[], const int length) \
{ \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \
\
int x = 0; \
for (;;) \
{ \
for (; x <= length - nlanes; x += nlanes) \
{ \
convertto_simd_nocoeff_impl(&in[x], &out[x]); \
} \
if (x < length) \
{ \
x = length - nlanes; \
continue; \
} \
break; \
} \
return x; \
}
CONVERTTO_NOCOEF_SIMD(ushort, uchar)
CONVERTTO_NOCOEF_SIMD(short, uchar)
CONVERTTO_NOCOEF_SIMD(float, uchar)
CONVERTTO_NOCOEF_SIMD(ushort, short)
CONVERTTO_NOCOEF_SIMD(uchar, short)
CONVERTTO_NOCOEF_SIMD(float, short)
CONVERTTO_NOCOEF_SIMD(uchar, ushort)
CONVERTTO_NOCOEF_SIMD(short, ushort)
CONVERTTO_NOCOEF_SIMD(float, ushort)
CONVERTTO_NOCOEF_SIMD(uchar, float)
CONVERTTO_NOCOEF_SIMD(ushort, float)
CONVERTTO_NOCOEF_SIMD(short, float)
#undef CONVERTTO_NOCOEF_SIMD
// Scaled conversion of one v_uint8-wide chunk: float -> uchar.
// Computes in*alpha + beta per lane with fused multiply-add, rounds to
// int32, then narrows int32 -> int16 -> uchar with unsigned saturation.
CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
constexpr int nlanes = v_uint8::nlanes;
v_float32 a1 = vx_load(inx);
v_float32 a2 = vx_load(&inx[nlanes / 4]);
v_float32 a3 = vx_load(&inx[nlanes / 2]);
v_float32 a4 = vx_load(&inx[3 * nlanes / 4]);
v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
v_int32 r3 = v_round(v_fma(a3, v_alpha, v_beta));
v_int32 r4 = v_round(v_fma(a4, v_alpha, v_beta));
vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
}
// Scaled conversion of one v_uint8-wide chunk: short/ushort -> uchar.
// Widens the 16-bit input to int32 via s16 expand, converts to float,
// applies in*alpha + beta (FMA), rounds, and packs back down to uchar
// with unsigned saturation.
// NOTE(review): ushort input is reinterpreted as s16 before the signed
// expand, so values > 32767 become negative before the float convert —
// confirm this matches the scalar path for large ushort inputs.
template<typename SRC>
CV_ALWAYS_INLINE
typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha,
const v_float32& v_beta)
{
constexpr int nlanes = v_uint8::nlanes;
v_int16 a = v_reinterpret_as_s16(vx_load(inx));
v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
v_float32 a1 = v_cvt_f32(v_expand_low(a));
v_float32 a2 = v_cvt_f32(v_expand_high(a));
v_float32 b1 = v_cvt_f32(v_expand_low(b));
v_float32 b2 = v_cvt_f32(v_expand_high(b));
v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
v_int32 r3 = v_round(v_fma(b1, v_alpha, v_beta));
v_int32 r4 = v_round(v_fma(b2, v_alpha, v_beta));
vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
}
// Scaled conversion of one v_uint8-wide chunk: uchar -> uchar.
// Widens uchar -> u16 -> int32 (values 0..255, so the s16 reinterpret is
// lossless here), converts to float, applies in*alpha + beta (FMA),
// rounds, and packs back to uchar with unsigned saturation.
CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const uchar* inx, uchar* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
v_uint8 a = vx_load(inx);
v_int16 a1 = v_reinterpret_as_s16(v_expand_low(a));
v_int16 a2 = v_reinterpret_as_s16(v_expand_high(a));
v_float32 f1 = v_cvt_f32(v_expand_low(a1));
v_float32 f2 = v_cvt_f32(v_expand_high(a1));
v_float32 f3 = v_cvt_f32(v_expand_low(a2));
v_float32 f4 = v_cvt_f32(v_expand_high(a2));
v_int32 r1 = v_round(v_fma(f1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(f2, v_alpha, v_beta));
v_int32 r3 = v_round(v_fma(f3, v_alpha, v_beta));
v_int32 r4 = v_round(v_fma(f4, v_alpha, v_beta));
vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
}
// Scaled conversion of one 16-bit-vector-wide chunk: float -> short/ushort.
// Applies in*alpha + beta (FMA), rounds to int32, and narrows with
// saturation via the v_store_i16() helper (defined elsewhere).
template<typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
convertto_scaled_simd_impl(const float* inx, DST* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
v_float32 a1 = vx_load(inx);
v_float32 a2 = vx_load(&inx[nlanes / 2]);
v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
v_store_i16(outx, r1, r2);
}
// Scaled conversion of one 16-bit-vector-wide chunk: uchar -> short/ushort.
// vx_load_expand widens exactly one 16-bit vector's worth of uchar values
// (0..255, so the s16 reinterpret is lossless); then float FMA, round, and
// saturating 16-bit store via v_store_i16().
template<typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
convertto_scaled_simd_impl(const uchar* inx, DST* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
v_int16 a = v_reinterpret_as_s16(vx_load_expand(inx));
v_float32 a1 = v_cvt_f32(v_expand_low(a));
v_float32 a2 = v_cvt_f32(v_expand_high(a));
v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
v_store_i16(outx, r1, r2);
}
// Scaled conversion of one 16-bit-vector-wide chunk between 16-bit types
// (short->short, ushort->ushort, short<->ushort): widen to int32 via s16
// expand, float FMA (in*alpha + beta), round, and saturating 16-bit store.
// NOTE(review): ushort input is reinterpreted as s16, so values > 32767
// are treated as negative before the float convert — confirm this matches
// the scalar path for large ushort inputs.
template<typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<SRC_DST_SHORT_AND_USHORT ||
SRC_DST_SHORT_OR_USHORT, void>::type
convertto_scaled_simd_impl(const SRC* inx, DST* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
v_int16 a = v_reinterpret_as_s16(vx_load(inx));
v_float32 a1 = v_cvt_f32(v_expand_low(a));
v_float32 a2 = v_cvt_f32(v_expand_high(a));
v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
v_store_i16(outx, r1, r2);
}
// Scaled conversion of one v_float32-wide chunk: any SRC -> float.
// Loads and converts via vg_load_f32(), then stores in*alpha + beta (FMA).
template<typename SRC>
CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx,
const v_float32& v_alpha,
const v_float32& v_beta)
{
v_float32 a = vg_load_f32(inx);
vx_store(outx, v_fma(a, v_alpha, v_beta));
}
// Stamps out convertto_scaled_simd() (out = saturate<DST>(in*alpha + beta))
// for one (SRC, DST) pair. Same loop shape as the no-coefficient kernel:
// full vectors of DST-lane width, with the final (possibly overlapping)
// vector re-covering the tail. Returns the number of elements written.
//
// Guard: if the row is shorter than one vector, the tail backup would make
// x negative and read/write out of bounds — return 0 and let the caller's
// scalar loop handle the whole row.
#define CONVERTTO_SCALED_SIMD(SRC, DST)                                     \
int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,     \
                          const float beta, const int length)               \
{                                                                           \
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;                   \
                                                                            \
    if (length < nlanes)                                                    \
        return 0;                                                           \
                                                                            \
    v_float32 v_alpha = vx_setall_f32(alpha);                               \
    v_float32 v_beta = vx_setall_f32(beta);                                 \
                                                                            \
    int x = 0;                                                              \
    for (;;)                                                                \
    {                                                                       \
        for (; x <= length - nlanes; x += nlanes)                           \
        {                                                                   \
            convertto_scaled_simd_impl(&in[x], &out[x], v_alpha, v_beta);   \
        }                                                                   \
        if (x < length)                                                     \
        {                                                                   \
            x = length - nlanes;                                            \
            continue;                                                       \
        }                                                                   \
        break;                                                              \
    }                                                                       \
    return x;                                                               \
}

CONVERTTO_SCALED_SIMD(uchar, uchar)
CONVERTTO_SCALED_SIMD(ushort, uchar)
CONVERTTO_SCALED_SIMD(short, uchar)
CONVERTTO_SCALED_SIMD(float, uchar)
CONVERTTO_SCALED_SIMD(short, short)
CONVERTTO_SCALED_SIMD(ushort, short)
CONVERTTO_SCALED_SIMD(uchar, short)
CONVERTTO_SCALED_SIMD(float, short)
CONVERTTO_SCALED_SIMD(ushort, ushort)
CONVERTTO_SCALED_SIMD(uchar, ushort)
CONVERTTO_SCALED_SIMD(short, ushort)
CONVERTTO_SCALED_SIMD(float, ushort)
CONVERTTO_SCALED_SIMD(uchar, float)
CONVERTTO_SCALED_SIMD(ushort, float)
CONVERTTO_SCALED_SIMD(short, float)
CONVERTTO_SCALED_SIMD(float, float)
#undef CONVERTTO_SCALED_SIMD
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END