mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00
Merge pull request #11169 from tomoaki0705:universalRemap
This commit is contained in:
commit
cfaca4327b
@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
|
||||
/** @brief Multiply and add
|
||||
|
||||
Returns \f$ a*b + c \f$
|
||||
For floating point types only. */
|
||||
For floating-point types and signed 32-bit integers only. */
|
||||
template<typename _Tp, int n>
|
||||
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
||||
const v_reg<_Tp, n>& c)
|
||||
@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Dot product of elements
|
||||
|
||||
Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
|
||||
Scheme:
|
||||
@code
|
||||
{A1 A2 ...} // 16-bit
|
||||
x {B1 B2 ...} // 16-bit
|
||||
-------------
|
||||
{A1B1+A2B2+C1 ...} // 32-bit
|
||||
|
||||
@endcode
|
||||
Implemented only for 16-bit signed source type (v_int16x8).
|
||||
*/
|
||||
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
|
||||
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
|
||||
{
|
||||
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
||||
v_reg<w_type, n/2> s;
|
||||
for( int i = 0; i < (n/2); i++ )
|
||||
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
|
||||
return s;
|
||||
}
|
||||
|
||||
/** @brief Multiply and expand
|
||||
|
||||
Multiply values of two registers and store results in two registers with wider pack type.
|
||||
|
@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
|
||||
}
|
||||
|
||||
// Dot product with accumulation: computes the two-argument v_dotprod(a, b)
// and adds the 32-bit lanes of c to it with vaddq_s32.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    return v_int32x4(vaddq_s32(v_dotprod(a, b).val, c.val));
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
|
||||
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
|
||||
@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
|
||||
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
|
||||
}
|
||||
|
||||
// Signed 32-bit multiply-add, a*b + c per lane.
// Note the argument order of vmlaq_s32: the addend comes first.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    const int32x4_t sum = vmlaq_s32(c.val, a.val, b.val);
    return v_int32x4(sum);
}
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
|
||||
{
|
||||
@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
|
||||
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
|
||||
#endif
|
||||
|
||||
#if CV_SIMD128_64F
|
||||
// Converts the four float lanes of a to signed 32-bit integers using the
// FCVTNS instruction (round to nearest, ties to even), issued via inline assembly.
inline v_int32x4 v_round(const v_float32x4& a)
{
    float32x4_t src = a.val;
    int32x4_t rounded;
    // %0 is the destination vector register, %1 the source vector register.
    __asm__ ("fcvtns %0.4s, %1.4s"
             : "=w"(rounded)
             : "w"(src)
             : /* No clobbers */);
    return v_int32x4(rounded);
}
|
||||
#else
|
||||
inline v_int32x4 v_round(const v_float32x4& a)
|
||||
{
|
||||
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
|
||||
@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
|
||||
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
|
||||
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
|
||||
}
|
||||
|
||||
#endif
|
||||
inline v_int32x4 v_floor(const v_float32x4& a)
|
||||
{
|
||||
int32x4_t a1 = vcvtq_s32_f32(a.val);
|
||||
|
@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
return v_int32x4(_mm_madd_epi16(a.val, b.val));
|
||||
}
|
||||
|
||||
// Dot product with accumulation: _mm_madd_epi16 multiplies the 16-bit lanes and sums
// adjacent pairs into 32-bit lanes; the lanes of c are then added with _mm_add_epi32.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    const __m128i pair_sums = _mm_madd_epi16(a.val, b.val);
    return v_int32x4(_mm_add_epi32(pair_sums, c.val));
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
|
||||
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
|
||||
@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
|
||||
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
|
||||
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
|
||||
}
|
||||
// Signed 32-bit multiply-add, a*b + c, expressed through the
// v_int32x4 arithmetic operators.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    const v_int32x4 product = a * b;
    return product + c;
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
|
||||
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
|
||||
@ -1632,7 +1641,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
|
||||
c = v_reinterpret_as_f64(t2);
|
||||
}
|
||||
|
||||
// 2-channel, float only
|
||||
// 2-channel
|
||||
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
@ -1644,7 +1653,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
|
||||
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
|
||||
}
|
||||
|
||||
inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
|
||||
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
|
||||
{
|
||||
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
|
||||
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
|
||||
|
||||
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
|
||||
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
|
||||
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
|
||||
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
|
||||
|
||||
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
|
||||
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7
|
||||
}
|
||||
|
||||
inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
|
||||
{
|
||||
v_int16x8 sa, sb;
|
||||
v_load_deinterleave((const short*)ptr, sa, sb);
|
||||
a = v_reinterpret_as_u16(sa);
|
||||
b = v_reinterpret_as_u16(sb);
|
||||
}
|
||||
|
||||
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
__m128i t0, t1;
|
||||
t0 = _mm_unpacklo_epi16(a.val, b.val);
|
||||
|
@ -760,6 +760,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
|
||||
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
|
||||
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
|
||||
|
||||
// Signed 32-bit multiply-add, a*b + c, built from the v_int32x4 operators.
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return (a * b) + c;
}
|
||||
|
||||
// TODO: exp, log, sin, cos
|
||||
|
||||
/** Absolute values **/
|
||||
@ -843,6 +846,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
|
||||
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
|
||||
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
|
||||
|
||||
// Dot product with accumulation: same vec_msum call as the two-argument
// overload, but with c as the third operand instead of vec_int4_z.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    return v_int32x4(vec_msum(a.val, b.val, c.val));
}
|
||||
|
||||
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
|
||||
const v_float32x4& m1, const v_float32x4& m2,
|
||||
const v_float32x4& m3)
|
||||
|
@ -521,15 +521,25 @@ template<typename R> struct TheTest
|
||||
// Checks both v_dotprod overloads against a scalar reference:
//   resD = v_dotprod(a, b)     -> sum of each adjacent pair of products
//   resE = v_dotprod(a, b, c)  -> the same sum plus the matching lane of c
// Returns *this so that test calls can be chained.
TheTest & test_dot_prod()
{
    typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
    typedef typename Rx2::lane_type w_type;

    Data<R> dataA, dataB(2);
    R a = dataA, b = dataB;

    // Offset the accumulator towards the edge of the w_type range:
    // the minimum for signed types, near the maximum for unsigned ones.
    // NOTE(review): presumably chosen so that the accumulated sums stay
    // representable in w_type - confirm against the Data<> fill pattern.
    Data<Rx2> dataC;
    dataC += std::numeric_limits<w_type>::is_signed ?
                std::numeric_limits<w_type>::min() :
                std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
    Rx2 c = dataC;

    Data<Rx2> resD = v_dotprod(a, b),
              resE = v_dotprod(a, b, c);

    const int n = R::nlanes / 2;
    for (int i = 0; i < n; ++i)
    {
        EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
        EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
    }
    return *this;
}
|
||||
|
@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
|
||||
|
||||
OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
|
||||
|
||||
SANITY_CHECK(dst, eps);
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
} } // namespace opencv_test::ocl
|
||||
|
@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
|
||||
|
||||
PERF_TEST_P( TestRemap, remap,
|
||||
Combine(
|
||||
Values( TYPICAL_MAT_TYPES ),
|
||||
Values( szVGA, sz720p, sz1080p ),
|
||||
Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
|
||||
Values( szVGA, sz1080p ),
|
||||
InterType::all(),
|
||||
BorderMode::all(),
|
||||
RemapMode::all()
|
||||
@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
|
||||
remap(source, destination, map_x, map_y, interpolationType, borderMode);
|
||||
}
|
||||
|
||||
SANITY_CHECK(destination, 1);
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user