mirror of
https://github.com/opencv/opencv.git
synced 2024-11-23 18:50:21 +08:00
Use generic SIMD in warpAffineBlocklineNN
This commit is contained in:
parent
450e741f8d
commit
45b9398d68
@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
|
||||
{
|
||||
CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
|
||||
|
||||
const int AB_BITS = MAX(10, (int)INTER_BITS);
|
||||
constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
|
||||
int x1 = 0;
|
||||
#if CV_TRY_SSE4_1
|
||||
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
|
||||
if( useSSE4_1 )
|
||||
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
|
||||
else
|
||||
#endif
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
{
|
||||
#if CV_SIMD128
|
||||
const v_int32 v_X0 = vx_setall_s32(X0);
|
||||
const v_int32 v_Y0 = vx_setall_s32(Y0);
|
||||
const int step = VTraits<v_int16>::vlanes();
|
||||
for (; x1 <= bw - step; x1 += step)
|
||||
{
|
||||
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
|
||||
int span = VTraits<v_uint16x8>::vlanes();
|
||||
for( ; x1 <= bw - span; x1 += span )
|
||||
{
|
||||
v_int16x8 v_dst[2];
|
||||
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
|
||||
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
|
||||
v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
|
||||
v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
|
||||
#undef CV_CONVERT_MAP
|
||||
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x1 < bw; x1++ )
|
||||
{
|
||||
int X = (X0 + adelta[x1]) >> AB_BITS;
|
||||
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
|
||||
xy[x1*2] = saturate_cast<short>(X);
|
||||
xy[x1*2+1] = saturate_cast<short>(Y);
|
||||
v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
|
||||
v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
|
||||
v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
|
||||
v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
|
||||
v_store_interleave(xy + 2 * x1, v_X, v_Y);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; x1 < bw; x1++)
|
||||
{
|
||||
const int X = (X0 + adelta[x1]) >> AB_BITS;
|
||||
const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
|
||||
xy[x1 * 2] = saturate_cast<short>(X);
|
||||
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
|
||||
}
|
||||
}
|
||||
|
||||
void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
|
||||
|
@ -74,7 +74,6 @@ namespace opt_SSE4_1
|
||||
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
|
||||
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
|
||||
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
|
||||
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
|
||||
|
||||
class WarpPerspectiveLine_SSE4
|
||||
{
|
||||
|
@ -173,42 +173,6 @@ void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, i
|
||||
}
|
||||
}
|
||||
|
||||
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
|
||||
{
|
||||
const int AB_BITS = MAX(10, (int)INTER_BITS);
|
||||
int x1 = 0;
|
||||
|
||||
__m128i v_X0 = _mm_set1_epi32(X0);
|
||||
__m128i v_Y0 = _mm_set1_epi32(Y0);
|
||||
for (; x1 <= bw - 16; x1 += 16)
|
||||
{
|
||||
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
|
||||
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));
|
||||
|
||||
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
|
||||
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
|
||||
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));
|
||||
|
||||
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
|
||||
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
|
||||
}
|
||||
for (; x1 < bw; x1++)
|
||||
{
|
||||
int X = (X0 + adelta[x1]) >> AB_BITS;
|
||||
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
|
||||
xy[x1 * 2] = saturate_cast<short>(X);
|
||||
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
|
||||
{
|
||||
public:
|
||||
|
Loading…
Reference in New Issue
Block a user