Use generic SIMD in warpAffineBlocklineNN

2025-07-20 11:06:38 +08:00 · 2024-09-27 01:35:04 +08:00 · 2024-09-27 01:35:04 +08:00 · 45b9398d68
commit 45b9398d68
parent 450e741f8d
3 changed files with 19 additions and 65 deletions
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
 {
    CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);

-    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
    int x1 = 0;
-    #if CV_TRY_SSE4_1
-    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
-    if( useSSE4_1 )
-        opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
-    else
-    #endif
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    {
-        #if CV_SIMD128
+        const v_int32 v_X0 = vx_setall_s32(X0);
+        const v_int32 v_Y0 = vx_setall_s32(Y0);
+        const int step = VTraits<v_int16>::vlanes();
+        for (; x1 <= bw - step; x1 += step)
        {
-            v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
-            int span = VTraits<v_uint16x8>::vlanes();
-            for( ; x1 <= bw - span; x1 += span )
-            {
-                v_int16x8 v_dst[2];
-                #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
-                                                                v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
-                v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
-                v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
-                #undef CV_CONVERT_MAP
-                v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
-            }
-        }
-        #endif
-        for( ; x1 < bw; x1++ )
-        {
-            int X = (X0 + adelta[x1]) >> AB_BITS;
-            int Y = (Y0 + bdelta[x1]) >> AB_BITS;
-            xy[x1*2] = saturate_cast<short>(X);
-            xy[x1*2+1] = saturate_cast<short>(Y);
+            v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
+                                 v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
+            v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
+                                 v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
+            v_store_interleave(xy + 2 * x1, v_X, v_Y);
        }
    }
+#endif
+    for (; x1 < bw; x1++)
+    {
+        const int X = (X0 + adelta[x1]) >> AB_BITS;
+        const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+    }
 }

 void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
--- a/modules/imgproc/src/imgwarp.hpp
+++ b/modules/imgproc/src/imgwarp.hpp
@@ -74,7 +74,6 @@ namespace opt_SSE4_1
 void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
 void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
 void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
-void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);

 class WarpPerspectiveLine_SSE4
 {
--- a/modules/imgproc/src/imgwarp.sse4_1.cpp
+++ b/modules/imgproc/src/imgwarp.sse4_1.cpp
@@ -173,42 +173,6 @@ void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, i
    }
 }

-void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
-{
-    const int AB_BITS = MAX(10, (int)INTER_BITS);
-    int x1 = 0;
-
-    __m128i v_X0 = _mm_set1_epi32(X0);
-    __m128i v_Y0 = _mm_set1_epi32(Y0);
-    for (; x1 <= bw - 16; x1 += 16)
-    {
-        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
-            _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
-        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
-            _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));
-
-        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
-            _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
-        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
-            _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));
-
-        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
-
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
-    }
-    for (; x1 < bw; x1++)
-    {
-        int X = (X0 + adelta[x1]) >> AB_BITS;
-        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
-        xy[x1 * 2] = saturate_cast<short>(X);
-        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-    }
-}
-
-
 class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
 {
 public: