GAPI Fluid Resize: AVX2 implementation.
This commit is contained in:
parent 13a995cc1d
commit abd237c67a
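This patch does two things: it tightens the fluid GResize meta contract (8-bit, 3-channel input, INTER_LINEAR only) and adds an AVX2 path for the row-linear resize, dispatched ahead of the existing SSE4.1 path in calcRowLinearC. Roughly, the code below is exercised by a G-API graph like the following sketch (assuming the fluid imgproc kernel package is exposed as cv::gapi::imgproc::fluid::kernels(); sizes and scale factors are illustrative):

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>
#include <opencv2/gapi/fluid/imgproc.hpp>

int main()
{
    cv::GMat in;
    cv::GMat out = cv::gapi::resize(in, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);
    cv::GComputation graph(in, out);

    cv::Mat src = cv::Mat::zeros(480, 640, CV_8UC3), dst;  // the fluid kernel asserts CV_8UC3
    graph.apply(src, dst, cv::compile_args(cv::gapi::imgproc::fluid::kernels()));
    return 0;
}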
@@ -497,17 +497,20 @@ namespace imgproc {
     };

     G_TYPED_KERNEL(GResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.imgproc.transform.resize") {
-        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int /*interp*/) {
+        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int interp) {
+            GAPI_Assert(in.depth == CV_8U);
+            GAPI_Assert(in.chan == 3);
+            GAPI_Assert(interp == cv::INTER_LINEAR);
             if (sz.width != 0 && sz.height != 0)
             {
-                return in.withSize(sz);
+                return in.withType(CV_8U, 3).withSize(sz);
             }
             else
             {
                 int outSz_w = saturate_cast<int>(in.size.width * fx);
                 int outSz_h = saturate_cast<int>(in.size.height * fy);
                 GAPI_Assert(outSz_w > 0 && outSz_h > 0);
-                return in.withSize(Size(outSz_w, outSz_h));
+                return in.withType(CV_8U, 3).withSize(Size(outSz_w, outSz_h));
             }
         }
     };
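The meta change above only pins the kernel to CV_8UC3 + INTER_LINEAR and fixes the reported output type; the output-size rule itself is unchanged: an explicit dsize wins, otherwise the size is derived from the fx/fy scale factors. A standalone sketch of that rule (the helper name is illustrative, not from the patch):

#include <opencv2/core.hpp>

// Resolve the output size the way GResize::outMeta does: prefer an explicit
// dsize, otherwise derive it from the fx/fy scale factors.
static cv::Size resolveResizeSize(const cv::Size& in, const cv::Size& dsize,
                                  double fx, double fy)
{
    if (dsize.width != 0 && dsize.height != 0)
        return dsize;

    const int w = cv::saturate_cast<int>(in.width  * fx);
    const int h = cv::saturate_cast<int>(in.height * fy);
    CV_Assert(w > 0 && h > 0);  // mirrors the GAPI_Assert in outMeta
    return cv::Size(w, h);
}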
@@ -2024,7 +2024,8 @@ struct Mapper {
 template<typename T, class Mapper, int numChan>
 CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
                                      cv::gapi::fluid::Buffer& out,
-                                     cv::gapi::fluid::Buffer& scratch) {
+                                     cv::gapi::fluid::Buffer& scratch)
+{
     using alpha_type = typename Mapper::alpha_type;

     auto inSz = in.meta().size;
@@ -2057,10 +2058,26 @@ CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
         dst[l] = out.OutLine<T>(l);
     }

-#if CV_SSE4_1
+#if CV_SIMD
     const auto* clone = scr.clone;
     auto* tmp = scr.tmp;
+#if CV_AVX2
+    if (inSz.width >= 32 && outSz.width >= 32)
+    {
+        avx2::calcRowLinear_8UC_Impl_(reinterpret_cast<uint8_t**>(dst),
+                                      reinterpret_cast<const uint8_t**>(src0),
+                                      reinterpret_cast<const uint8_t**>(src1),
+                                      reinterpret_cast<const short*>(alpha),
+                                      reinterpret_cast<const short*>(clone),
+                                      reinterpret_cast<const short*>(mapsx),
+                                      reinterpret_cast<const short*>(beta),
+                                      reinterpret_cast<uint8_t*>(tmp),
+                                      inSz, outSz, lpi);
+
+        return;
+    }
+#endif // CV_AVX2
+#if CV_SSE4_1
     if (inSz.width >= 16 && outSz.width >= 16)
     {
         sse41::calcRowLinear_8UC_Impl_<numChan>(reinterpret_cast<uint8_t**>(dst),
@@ -2076,20 +2093,21 @@ CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
         return;
     }
 #endif // CV_SSE4_1
+#endif // CV_SIMD
     int length = out.length();
-    for (int l = 0; l < lpi; l++) {
+    for (int l = 0; l < lpi; ++l) {
         constexpr static const auto unity = Mapper::unity;

         auto beta0 = beta[l];
         auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);

-        for (int x = 0; x < length; x++) {
+        for (int x = 0; x < length; ++x) {
             auto alpha0 = alpha[x];
             auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
             auto sx0 = mapsx[x];
             auto sx1 = sx0 + 1;

-            for (int c = 0; c < numChan; c++) {
+            for (int c = 0; c < numChan; ++c) {
                 auto idx0 = numChan*sx0 + c;
                 auto idx1 = numChan*sx1 + c;
                 T tmp0 = resize_calc_revert_fixedpoint(beta0, src0[l][idx0], beta1, src1[l][idx0]);
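Both the scalar tail above (resize_calc_revert_fixedpoint) and the SIMD passes in the new AVX2 file below compute the same rounded fixed-point blend: with unity = 1 << 15, dst = v1 + (((v0 - v1) * w + (1 << 14)) >> 15), where w is alpha for the horizontal pass and beta for the vertical one. A minimal scalar sketch of that identity (names are illustrative, not from the patch):

#include <opencv2/core/saturate.hpp>
#include <cstdint>

// dst = w*v0 + (1 - w)*v1 with w in Q0.15, rounded to nearest; written in the
// "v1 + mulhrs(v0 - v1, w)" form the SIMD code uses.
static inline uint8_t blendQ15(uint8_t v0, uint8_t v1, short w)
{
    const int diff = int(v0) - int(v1);
    const int t    = (diff * w + (1 << 14)) >> 15;   // == mulhrs(diff, w)
    return cv::saturate_cast<uint8_t>(int(v1) + t);
}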
@@ -174,6 +174,510 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
         memcpy(dst[0], src0[0], length * sizeof(float)*lpi);
     }
 }
+
+CV_ALWAYS_INLINE void resize_horizontal_anyLPI(uint8_t* dst,
+                                               const uchar* src, const short mapsx[],
+                                               const short alpha[], const int width)
+{
+    constexpr int nlanes = 16;
+    constexpr int chanNum = 3;
+    __m128i zero = _mm_setzero_si128();
+
+    for (int x = 0; width >= nlanes;)
+    {
+        for (; x <= width - nlanes; x += nlanes)
+        {
+            __m128i a012 = _mm_setr_epi16(alpha[x], alpha[x], alpha[x], alpha[x + 1],
+                                          alpha[x + 1], alpha[x + 1], alpha[x + 2], alpha[x + 2]);
+            __m128i a2345 = _mm_setr_epi16(alpha[x + 2], alpha[x + 3], alpha[x + 3], alpha[x + 3],
+                                           alpha[x + 4], alpha[x + 4], alpha[x + 4], alpha[x + 5]);
+
+            __m128i a567 = _mm_setr_epi16(alpha[x + 5], alpha[x + 5], alpha[x + 6], alpha[x + 6],
+                                          alpha[x + 6], alpha[x + 7], alpha[x + 7], alpha[x + 7]);
+            __m128i a8910 = _mm_setr_epi16(alpha[x + 8], alpha[x + 8], alpha[x + 8], alpha[x + 9],
+                                           alpha[x + 9], alpha[x + 9], alpha[x + 10], alpha[x + 10]);
+
+            __m128i a10111213 = _mm_setr_epi16(alpha[x + 10], alpha[x + 11], alpha[x + 11], alpha[x + 11],
+                                               alpha[x + 12], alpha[x + 12], alpha[x + 12], alpha[x + 13]);
+            __m128i a131415 = _mm_setr_epi16(alpha[x + 13], alpha[x + 13], alpha[x + 14], alpha[x + 14],
+                                             alpha[x + 14], alpha[x + 15], alpha[x + 15], alpha[x + 15]);
+
+            __m128i a1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 0)], src[chanNum * (mapsx[x] + 0) + 1], src[chanNum * (mapsx[x] + 0) + 2],
+                                       src[chanNum * (mapsx[x + 1] + 0)], src[chanNum * (mapsx[x + 1] + 0) + 1], src[chanNum * (mapsx[x + 1] + 0) + 2],
+                                       src[chanNum * (mapsx[x + 2] + 0)], src[chanNum * (mapsx[x + 2] + 0) + 1], src[chanNum * (mapsx[x + 2] + 0) + 2],
+                                       src[chanNum * (mapsx[x + 3] + 0)], src[chanNum * (mapsx[x + 3] + 0) + 1], src[chanNum * (mapsx[x + 3] + 0) + 2],
+                                       src[chanNum * (mapsx[x + 4] + 0)], src[chanNum * (mapsx[x + 4] + 0) + 1], src[chanNum * (mapsx[x + 4] + 0) + 2],
+                                       src[chanNum * (mapsx[x + 5] + 0)]);
+            __m128i b1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 1)], src[chanNum * (mapsx[x] + 1) + 1], src[chanNum * (mapsx[x] + 1) + 2],
+                                       src[chanNum * (mapsx[x + 1] + 1)], src[chanNum * (mapsx[x + 1] + 1) + 1], src[chanNum * (mapsx[x + 1] + 1) + 2],
+                                       src[chanNum * (mapsx[x + 2] + 1)], src[chanNum * (mapsx[x + 2] + 1) + 1], src[chanNum * (mapsx[x + 2] + 1) + 2],
+                                       src[chanNum * (mapsx[x + 3] + 1)], src[chanNum * (mapsx[x + 3] + 1) + 1], src[chanNum * (mapsx[x + 3] + 1) + 2],
+                                       src[chanNum * (mapsx[x + 4] + 1)], src[chanNum * (mapsx[x + 4] + 1) + 1], src[chanNum * (mapsx[x + 4] + 1) + 2],
+                                       src[chanNum * (mapsx[x + 5] + 1)]);
+
+            __m128i a2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 0) + 1], src[chanNum * (mapsx[x + 5] + 0) + 2], src[chanNum * (mapsx[x + 6] + 0)],
+                                       src[chanNum * (mapsx[x + 6] + 0) + 1], src[chanNum * (mapsx[x + 6] + 0) + 2], src[chanNum * (mapsx[x + 7] + 0)],
+                                       src[chanNum * (mapsx[x + 7] + 0) + 1], src[chanNum * (mapsx[x + 7] + 0) + 2], src[chanNum * (mapsx[x + 8] + 0)],
+                                       src[chanNum * (mapsx[x + 8] + 0) + 1], src[chanNum * (mapsx[x + 8] + 0) + 2], src[chanNum * (mapsx[x + 9] + 0)],
+                                       src[chanNum * (mapsx[x + 9] + 0) + 1], src[chanNum * (mapsx[x + 9] + 0) + 2], src[chanNum * (mapsx[x + 10] + 0)],
+                                       src[chanNum * (mapsx[x + 10] + 0) + 1]);
+
+            __m128i b2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 1) + 1], src[chanNum * (mapsx[x + 5] + 1) + 2], src[chanNum * (mapsx[x + 6] + 1)],
+                                       src[chanNum * (mapsx[x + 6] + 1) + 1], src[chanNum * (mapsx[x + 6] + 1) + 2], src[chanNum * (mapsx[x + 7] + 1)],
+                                       src[chanNum * (mapsx[x + 7] + 1) + 1], src[chanNum * (mapsx[x + 7] + 1) + 2], src[chanNum * (mapsx[x + 8] + 1)],
+                                       src[chanNum * (mapsx[x + 8] + 1) + 1], src[chanNum * (mapsx[x + 8] + 1) + 2], src[chanNum * (mapsx[x + 9] + 1)],
+                                       src[chanNum * (mapsx[x + 9] + 1) + 1], src[chanNum * (mapsx[x + 9] + 1) + 2], src[chanNum * (mapsx[x + 10] + 1)],
+                                       src[chanNum * (mapsx[x + 10] + 1) + 1]);
+
+            __m128i a3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 0) + 2], src[chanNum * (mapsx[x + 11] + 0)], src[chanNum * (mapsx[x + 11] + 0) + 1],
+                                       src[chanNum * (mapsx[x + 11] + 0) + 2], src[chanNum * (mapsx[x + 12] + 0)], src[chanNum * (mapsx[x + 12] + 0) + 1],
+                                       src[chanNum * (mapsx[x + 12] + 0) + 2], src[chanNum * (mapsx[x + 13] + 0)], src[chanNum * (mapsx[x + 13] + 0) + 1],
+                                       src[chanNum * (mapsx[x + 13] + 0) + 2], src[chanNum * (mapsx[x + 14] + 0)], src[chanNum * (mapsx[x + 14] + 0) + 1],
+                                       src[chanNum * (mapsx[x + 14] + 0) + 2], src[chanNum * (mapsx[x + 15] + 0)], src[chanNum * (mapsx[x + 15] + 0) + 1],
+                                       src[chanNum * (mapsx[x + 15] + 0) + 2]);
+
+            __m128i b3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 1) + 2], src[chanNum * (mapsx[x + 11] + 1)], src[chanNum * (mapsx[x + 11] + 1) + 1],
+                                       src[chanNum * (mapsx[x + 11] + 1) + 2], src[chanNum * (mapsx[x + 12] + 1)], src[chanNum * (mapsx[x + 12] + 1) + 1],
+                                       src[chanNum * (mapsx[x + 12] + 1) + 2], src[chanNum * (mapsx[x + 13] + 1)], src[chanNum * (mapsx[x + 13] + 1) + 1],
+                                       src[chanNum * (mapsx[x + 13] + 1) + 2], src[chanNum * (mapsx[x + 14] + 1)], src[chanNum * (mapsx[x + 14] + 1) + 1],
+                                       src[chanNum * (mapsx[x + 14] + 1) + 2], src[chanNum * (mapsx[x + 15] + 1)], src[chanNum * (mapsx[x + 15] + 1) + 1],
+                                       src[chanNum * (mapsx[x + 15] + 1) + 2]);
+
+            __m128i a11 = _mm_unpacklo_epi8(a1, zero);
+            __m128i a12 = _mm_unpackhi_epi8(a1, zero);
+            __m128i a21 = _mm_unpacklo_epi8(a2, zero);
+            __m128i a22 = _mm_unpackhi_epi8(a2, zero);
+            __m128i a31 = _mm_unpacklo_epi8(a3, zero);
+            __m128i a32 = _mm_unpackhi_epi8(a3, zero);
+            __m128i b11 = _mm_unpacklo_epi8(b1, zero);
+            __m128i b12 = _mm_unpackhi_epi8(b1, zero);
+            __m128i b21 = _mm_unpacklo_epi8(b2, zero);
+            __m128i b22 = _mm_unpackhi_epi8(b2, zero);
+            __m128i b31 = _mm_unpacklo_epi8(b3, zero);
+            __m128i b32 = _mm_unpackhi_epi8(b3, zero);
+
+            __m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a11, b11), a012);
+            __m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a12, b12), a2345);
+            __m128i r3 = _mm_mulhrs_epi16(_mm_sub_epi16(a21, b21), a567);
+            __m128i r4 = _mm_mulhrs_epi16(_mm_sub_epi16(a22, b22), a8910);
+            __m128i r5 = _mm_mulhrs_epi16(_mm_sub_epi16(a31, b31), a10111213);
+            __m128i r6 = _mm_mulhrs_epi16(_mm_sub_epi16(a32, b32), a131415);
+
+            __m128i r_1 = _mm_add_epi16(b11, r1);
+            __m128i r_2 = _mm_add_epi16(b12, r2);
+            __m128i r_3 = _mm_add_epi16(b21, r3);
+            __m128i r_4 = _mm_add_epi16(b22, r4);
+            __m128i r_5 = _mm_add_epi16(b31, r5);
+            __m128i r_6 = _mm_add_epi16(b32, r6);
+
+            __m128i res1 = _mm_packus_epi16(r_1, r_2);
+            __m128i res2 = _mm_packus_epi16(r_3, r_4);
+            __m128i res3 = _mm_packus_epi16(r_5, r_6);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x]), res1);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 16]), res2);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 32]), res3);
+        }
+        if (x < width) {
+            x = width - nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
+                                           uint8_t tmp[], const short beta[],
+                                           const int& length)
+{
+    constexpr int half_nlanes = (v_uint8::nlanes / 2);
+    GAPI_DbgAssert(length >= half_nlanes);
+
+    __m256i b0 = _mm256_set1_epi16(beta[0]);
+    __m256i b1 = _mm256_set1_epi16(beta[1]);
+    __m256i b2 = _mm256_set1_epi16(beta[2]);
+    __m256i b3 = _mm256_set1_epi16(beta[3]);
+
+    __m256i shuf_mask = _mm256_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13,
+                                         2, 10, 6, 14, 3, 11, 7, 15,
+                                         0, 8, 4, 12, 1, 9, 5, 13,
+                                         2, 10, 6, 14, 3, 11, 7, 15);
+    for (int w = 0; w < length; ) {
+        for (; w <= length - half_nlanes; w += half_nlanes)
+        {
+            __m256i val0_0 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[0][w])));
+            __m256i val0_1 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[1][w])));
+            __m256i val0_2 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[2][w])));
+            __m256i val0_3 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[3][w])));
+
+            __m256i val1_0 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[0][w])));
+            __m256i val1_1 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[1][w])));
+            __m256i val1_2 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[2][w])));
+            __m256i val1_3 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[3][w])));
+
+            __m256i t0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_0, val1_0), b0);
+            __m256i t1 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_1, val1_1), b1);
+            __m256i t2 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_2, val1_2), b2);
+            __m256i t3 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_3, val1_3), b3);
+
+            __m256i r0 = _mm256_add_epi16(val1_0, t0);
+            __m256i r1 = _mm256_add_epi16(val1_1, t1);
+            __m256i r2 = _mm256_add_epi16(val1_2, t2);
+            __m256i r3 = _mm256_add_epi16(val1_3, t3);
+
+            __m256i q0 = _mm256_packus_epi16(r0, r1);
+            __m256i q1 = _mm256_packus_epi16(r2, r3);
+
+            __m256i q2 = _mm256_blend_epi16(q0, _mm256_slli_si256(q1, 4), 0xCC /*0b11001100*/);
+            __m256i q3 = _mm256_blend_epi16(_mm256_srli_si256(q0, 4), q1, 0xCC /*0b11001100*/);
+
+            __m256i q4 = _mm256_shuffle_epi8(q2, shuf_mask);
+            __m256i q5 = _mm256_shuffle_epi8(q3, shuf_mask);
+
+            __m256i q6 = _mm256_permute2x128_si256(q4, q5, 0x20);
+            __m256i q7 = _mm256_permute2x128_si256(q4, q5, 0x31);
+
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&tmp[4 * w + 0]), q6);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&tmp[4 * w + 2 * half_nlanes]), q7);
+        }
+
+        if (w < length)
+        {
+            w = length - half_nlanes;
+        }
+    }
+}
+
+CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
+                                             uint8_t tmp[], const int& beta0,
+                                             const int l, const int length1, const int length2) {
+    constexpr int half_nlanes = (v_uint8::nlanes / 2);
+    GAPI_DbgAssert(length1 >= half_nlanes);
+
+    for (int w = 0; w < length2; ) {
+        for (; w <= length1 - half_nlanes; w += half_nlanes) {
+            v_int16x16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
+            v_int16x16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
+            v_int16x16 t = v_mulhrs(s0 - s1, beta0) + s1;
+            v_pack_u_store(tmp + w, t);
+        }

+        if (w < length1) {
+            w = length1 - half_nlanes;
+        }
+    }
+}
+
+CV_ALWAYS_INLINE v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos)
+{
+    constexpr int chanNum = 3;
+    v_int16x16 r;
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 0) + pos) + channel]), 0);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 1) + pos) + channel]), 1);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 2) + pos) + channel]), 2);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 3) + pos) + channel]), 3);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 4) + pos) + channel]), 4);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 5) + pos) + channel]), 5);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 6) + pos) + channel]), 6);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 7) + pos) + channel]), 7);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 8) + pos) + channel]), 8);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 9) + pos) + channel]), 9);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 10) + pos) + channel]), 10);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 11) + pos) + channel]), 11);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 12) + pos) + channel]), 12);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 13) + pos) + channel]), 13);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 14) + pos) + channel]), 14);
+    r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 15) + pos) + channel]), 15);
+    return r;
+}
+
+CV_ALWAYS_INLINE void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[],
+                                       int chanNum, int c, int x, int shift)
+{
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3);
+
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 4);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 5);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 6);
+    vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7);
+}
+
+CV_ALWAYS_INLINE v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b)
+{
+    return v_int16x16(_mm256_mulhrs_epi16(a.val, b.val));
+}
+
+static inline v_int16x16 v_mulhrs(const v_int16x16& a, short b)
+{
+    return v_mulhrs(a, v256_setall_s16(b));
+}
+
+CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(uint8_t* dst[],
+                                             const uint8_t* src0[],
+                                             const uint8_t* src1[],
+                                             const short    alpha[],
+                                             const short*   clone,  // 4 clones of alpha
+                                             const short    mapsx[],
+                                             const short    beta[],
+                                             uint8_t        tmp[],
+                                             const Size&    inSz,
+                                             const Size&    outSz,
+                                             const int      lpi)
+{
+    constexpr int nlanes = 32;  // number of 8-bit integers that fit into a 256-bit SIMD vector.
+    constexpr int half_nlanes = nlanes / 2;
+    constexpr int chanNum = 3;
+
+    if ((inSz.width * chanNum < half_nlanes) || (outSz.width < half_nlanes))
+        return;
+
+    const int shift = (half_nlanes / 4);
+
+    if (4 == lpi)
+    {
+        verticalPass_lpi4_8U(src0, src1, tmp, beta,
+                             inSz.width*chanNum);
+
+        // horizontal pass
+        __m256i shuff_mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                              0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+        __m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+
+        constexpr int nproc_pixels = 5;
+        for (int x = 0;;)
+        {
+            for (; x <= outSz.width - (nproc_pixels + 1); x += nproc_pixels)
+            {
+                v_int16x16 a10 = vx_load(&clone[4 * x]);
+                v_int16x16 a32 = vx_load(&clone[4 * (x + 4)]);
+                v_int16x16 a54 = vx_load(&clone[4 * (x + 8)]);
+                v_int16x16 a76 = vx_load(&clone[4 * (x + 12)]);
+
+                __m128i pix1 = _mm_setzero_si128();
+                pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x])]), 0);
+                pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x]) + 8]), 2);
+
+                __m128i pix2 = _mm_setzero_si128();
+                pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x] + 1))]), 0);
+                pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x] + 1)) + 8]), 2);
+
+                __m128i pix3 = _mm_setzero_si128();
+                pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 1])]), 0);
+                pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 1]) + 8]), 2);
+
+                __m128i pix4 = _mm_setzero_si128();
+                pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1))]), 0);
+                pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1)) + 8]), 2);
+
+                __m256i ext_pix1 = _mm256_cvtepi8_epi16(pix1);
+                __m256i ext_pix2 = _mm256_cvtepi8_epi16(pix2);
+                __m256i ext_pix3 = _mm256_cvtepi8_epi16(pix3);
+                __m256i ext_pix4 = _mm256_cvtepi8_epi16(pix4);
+
+                __m256i t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a00);
+                __m256i t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a11);
+
+                __m256i r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
+                __m256i r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
+
+                __m256i q0_0 = _mm256_packus_epi16(r0_0, r1_0);
+
+                __m256i perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
+                __m256i shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
+
+                __m256i res1 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
+
+                pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 2])]), 0);
+                pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 2]) + 8]), 2);
+
+                pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1))]), 0);
+                pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1)) + 8]), 2);
+
+                pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 3])]), 0);
+                pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 3]) + 8]), 2);
+
+                pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1))]), 0);
+                pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1)) + 8]), 2);
+
+                ext_pix1 = _mm256_cvtepi8_epi16(pix1);
+                ext_pix2 = _mm256_cvtepi8_epi16(pix2);
+                ext_pix3 = _mm256_cvtepi8_epi16(pix3);
+                ext_pix4 = _mm256_cvtepi8_epi16(pix4);
+
+                t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a22);
+                t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a33);
+
+                r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
+                r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
+
+                q0_0 = _mm256_packus_epi16(r0_0, r1_0);
+
+                perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
+                shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
+
+                __m256i res2 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
+
+                pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 4])]), 0);
+                pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 4]) + 8]), 2);
+
+                pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1))]), 0);
+                pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1)) + 8]), 2);
+
+                pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 5])]), 0);
+                pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 5]) + 8]), 2);
+
+                pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1))]), 0);
+                pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1)) + 8]), 2);
+
+                ext_pix1 = _mm256_cvtepi8_epi16(pix1);
+                ext_pix2 = _mm256_cvtepi8_epi16(pix2);
+                ext_pix3 = _mm256_cvtepi8_epi16(pix3);
+                ext_pix4 = _mm256_cvtepi8_epi16(pix4);
+
+                t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a44);
+                t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a55);
+
+                r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
+                r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
+
+                q0_0 = _mm256_packus_epi16(r0_0, r1_0);
+
+                perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
+                shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
+
+                __m256i res3 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
+
+                pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 6])]), 0);
+                pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 6]) + 8]), 2);
+
+                pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1))]), 0);
+                pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1)) + 8]), 2);
+
+                pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 7])]), 0);
+                pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 7]) + 8]), 2);
+
+                pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1))]), 0);
+                pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1)) + 8]), 2);
+
+                ext_pix1 = _mm256_cvtepi8_epi16(pix1);
+                ext_pix2 = _mm256_cvtepi8_epi16(pix2);
+                ext_pix3 = _mm256_cvtepi8_epi16(pix3);
+                ext_pix4 = _mm256_cvtepi8_epi16(pix4);
+
+                t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a66);
+                t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a77);
+
+                r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
+                r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
+
+                q0_0 = _mm256_packus_epi16(r0_0, r1_0);
+
+                perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
+                shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
+
+                __m256i res4 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
+
+                pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 8])]), 0);
+                pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 8]) + 8]), 2);
+
+                pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 8] + 1))]), 0);
+                pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 8] + 1)) + 8]), 2);
+
+                pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 9])]), 0);
+                pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 9]) + 8]), 2);
+
+                pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1))]), 0);
+                pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1)) + 8]), 2);
+
+                ext_pix1 = _mm256_cvtepi8_epi16(pix1);
+                ext_pix2 = _mm256_cvtepi8_epi16(pix2);
+                ext_pix3 = _mm256_cvtepi8_epi16(pix3);
+                ext_pix4 = _mm256_cvtepi8_epi16(pix4);
+
+                t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a88);
+                t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a99);
+
+                r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
+                r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
+
+                q0_0 = _mm256_packus_epi16(r0_0, r1_0);
+
+                perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
+                shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
+
+                __m256i res5 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
+
+                __m256i bl1 = _mm256_blend_epi16(res1, _mm256_slli_si256(res2, 8), 240 /*0b11110000*/);
+                __m256i bl2 = _mm256_blend_epi16(_mm256_srli_si256(res1, 8), res2, 240 /*0b11110000*/);
+
+                __m256i bl3 = _mm256_blend_epi16(res3, _mm256_slli_si256(res4, 8), 240 /*0b11110000*/);
+                __m256i bl4 = _mm256_blend_epi16(_mm256_srli_si256(res3, 8), res4, 240 /*0b11110000*/);
+
+                __m256i perm1 = _mm256_permute2x128_si256(bl1, bl3, 32);
+                __m256i perm2 = _mm256_permute2x128_si256(bl2, bl4, 32);
+
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[0][chanNum * x]), bl1);
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[1][chanNum * x]), bl2);
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[2][chanNum * x]), bl3);
+                _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[3][chanNum * x]), bl4);
+            }
+
+            for (; x < outSz.width; ++x)
+            {
+                constexpr static const int ONE = 1 << 15;
+                constexpr static const int half = 1 << 14;
+                auto alpha0 = alpha[x];
+                auto alpha1 = saturate_cast<short>(ONE - alpha[x]);
+
+                for (int c = 0; c < chanNum; ++c)
+                {
+                    dst[0][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c)    ] * alpha0 +
+                                               tmp[4 * (chanNum * (mapsx[x] + 1) + c)    ] * alpha1 + half) >> 15;
+                    dst[1][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 1] * alpha0 +
+                                               tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 1] * alpha1 + half) >> 15;
+                    dst[2][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 2] * alpha0 +
+                                               tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 2] * alpha1 + half) >> 15;
+                    dst[3][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 3] * alpha0 +
+                                               tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 3] * alpha1 + half) >> 15;
+                }
+            }
+
+            break;
+        }
+    }
+    else
+    { // if any lpi
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+
+            // vertical pass
+            verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
+                                   inSz.width*chanNum, inSz.width*chanNum);
+
+            // horizontal pass
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
+                    for (int c = 0; c < chanNum; ++c) {
+                        v_int16x16 a0 = v256_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16x16 sx = v256_load(&mapsx[x]);        // as integer (int16)
+                        v_int16x16 t0 = v_gather_chan(tmp, sx, c, 0);
+                        v_int16x16 t1 = v_gather_chan(tmp, sx, c, 1);
+                        v_int16x16 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[l][x], d);
+                    }
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - half_nlanes;
+                }
+            }
+        }
+    }
+    return;
+}
+
 } // namespace avx2
 } // namespace fliud
 } // namespace gapi
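The AVX2 passes above lean on _mm256_mulhrs_epi16 for that same blend, relying on the per-lane identity mulhrs(a, w) == (a*w + (1 << 14)) >> 15. A small self-contained check of that identity in the shape the vertical pass uses (illustrative only, not part of the commit; build with AVX2 enabled):

#include <immintrin.h>
#include <cstdio>

int main()
{
    const short w = 1 << 14;                 // weight 0.5 in Q0.15
    __m256i v0 = _mm256_set1_epi16(200);     // "upper" row sample, widened to 16 bit
    __m256i v1 = _mm256_set1_epi16(100);     // "lower" row sample
    __m256i t  = _mm256_mulhrs_epi16(_mm256_sub_epi16(v0, v1), _mm256_set1_epi16(w));
    __m256i r  = _mm256_add_epi16(v1, t);    // v1 + mulhrs(v0 - v1, w)

    short lanes[16];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(lanes), r);

    const int scalar = 100 + (((200 - 100) * w + (1 << 14)) >> 15);
    std::printf("simd = %d, scalar = %d\n", int(lanes[0]), scalar);  // both print 150
    return 0;
}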