Merge pull request #21728 from anna-khakimova:ak/resize_f32c1_avx_simd

GAPI Fluid: SIMD AVX2 Resize F32C1.

* GAPI Fluid: Resize F32C1 scalar.

* Final version

* GAPI Fluid: SIMD AVX2 for Resize F32C1.

* Applied comments.

* Deleted warning suppression.

* Applied comments.
Anna Khakimova 2022-03-25 18:11:01 +03:00 committed by GitHub
parent 9dd8e4df7f
commit e5bdab0355
2 changed files with 201 additions and 0 deletions


@@ -25,6 +25,9 @@
#include "gfluidimgproc_func.hpp"
#if CV_AVX2
#include "gfluidimgproc_simd_avx2.hpp"
#endif
#if CV_SSE4_1
#include "gfluidcore_simd_sse41.hpp"
#endif
@@ -2132,11 +2135,25 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
{
auto index0 = mapsy[outY + l] - inY;
auto index1 = mapsy[outSz.height + outY + l] - inY;
src0[l] = in.InLine<const float>(index0);
src1[l] = in.InLine<const float>(index1);
dst[l] = out.OutLine<float>(l);
}
#if CV_AVX2
// Number of floats in an AVX2 SIMD vector.
constexpr int nlanes = 8;
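// Take the AVX2 path only when both the input and output rows hold at least
// one full vector; otherwise fall through to the scalar loop below.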
if (inSz.width >= nlanes && outSz.width >= nlanes)
{
avx2::calcRowLinear32FC1Impl(dst, src0, src1, alpha, mapsx, beta,
inSz, outSz, lpi);
return;
}
#endif // CV_AVX2
using alpha_type = typename Mapper::alpha_type;
for (int l = 0; l < lpi; ++l)
{
@@ -2150,6 +2167,7 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
auto sx0 = mapsx[x];
auto sx1 = sx0 + 1;
float tmp0 = resize_main_calculation(b0, src0[l][sx0], b1, src1[l][sx0]);
float tmp1 = resize_main_calculation(b0, src0[l][sx1], b1, src1[l][sx1]);
dst[l][x] = resize_main_calculation(alpha0, tmp0, alpha1, tmp1);
@@ -2174,6 +2192,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
GAPI_Assert((in.depth == CV_8U && in.chan == 3) ||
(in.depth == CV_32F && in.chan == 1));
GAPI_Assert(interp == cv::INTER_LINEAR);
int outSz_w;
int outSz_h;
if (outSz.width == 0 || outSz.height == 0)
@@ -2212,6 +2231,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
GAPI_Assert((in.meta().depth == CV_8U && in.meta().chan == 3) ||
(in.meta().depth == CV_32F && in.meta().chan == 1));
GAPI_Assert(interp == cv::INTER_LINEAR);
const int channels = in.meta().chan;
const int depth = in.meta().depth;

gfluidimgproc_simd_avx2.hpp (new file)

@@ -0,0 +1,181 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2022 Intel Corporation
#if !defined(GAPI_STANDALONE)
#include "opencv2/gapi/own/saturate.hpp"
#include <immintrin.h>
#include "opencv2/core.hpp"
#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <limits>
#include <vector>
namespace cv {
namespace gapi {
namespace fluid {
namespace avx2 {
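// Gather eight (src[mapsx[i]], src[mapsx[i] + 1]) pairs with 64-bit loads:
// 'low' receives the pairs for mapsx[0..3], 'high' the pairs for mapsx[4..7].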
CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int* mapsx,
v_float32x8& low, v_float32x8& high)
{
low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[0]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[1]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[2]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[3]])));
high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[4]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[5]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[6]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[7]])));
}
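// De-interleave the gathered pairs: 'even' collects the left neighbours
// (src[sx0]) and 'odd' the right neighbours (src[sx0 + 1]).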
CV_ALWAYS_INLINE void v_deinterleave(const v_float32x8& low, const v_float32x8& high,
v_float32x8& even, v_float32x8& odd)
{
__m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val);
__m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val);
__m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
__m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/));
odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/));
}
// Resize (bi-linear, 32FC1)
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi)
{
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
constexpr int nlanes = v_float32x8::nlanes;
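// Four specializations: scaling in both directions, horizontal only,
// vertical only, or a plain copy when neither dimension changes.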
if (!xRatioEq1 && !yRatioEq1)
{
for (int line = 0; line < lpi; ++line) {
float beta0 = beta[line];
float beta1 = 1 - beta0;
v_float32x8 v_beta0 = v256_setall_f32(beta0);
int x = 0;
v_float32x8 low1, high1, s00, s01;
v_float32x8 low2, high2, s10, s11;
for (; x <= outSz.width - nlanes; x += nlanes)
{
v_float32x8 alpha0 = v256_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_gather_pairs(src0[line], &mapsx[x], low1, high1);
v_deinterleave(low1, high1, s00, s01);
// v_float32 res0 = s00*alpha0 + s01*alpha1;
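// a*t + b*(1 - t) is folded into a single FMA as (a - b)*t + b.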
v_float32x8 res0 = v_fma(s00 - s01, alpha0, s01);
v_gather_pairs(src1[line], &mapsx[x], low2, high2);
v_deinterleave(low2, high2, s10, s11);
// v_float32 res1 = s10*alpha0 + s11*alpha1;
v_float32x8 res1 = v_fma(s10 - s11, alpha0, s11);
// v_float32 d = res0*beta0 + res1*beta1;
v_float32x8 d = v_fma(res0 - res1, v_beta0, res1);
v_store(&dst[line][x], d);
}
for (; x < outSz.width; ++x)
{
float alpha0 = alpha[x];
float alpha1 = 1 - alpha0;
int sx0 = mapsx[x];
int sx1 = sx0 + 1;
float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1;
dst[line][x] = beta0 * res0 + beta1 * res1;
}
}
}
else if (!xRatioEq1)
{
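// Vertical ratio is 1: interpolate along x only, no blend between rows.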
for (int line = 0; line < lpi; ++line) {
int x = 0;
v_float32x8 low, high, s00, s01;
for (; x <= outSz.width - nlanes; x += nlanes)
{
v_float32x8 alpha0 = v256_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_gather_pairs(src0[line], &mapsx[x], low, high);
v_deinterleave(low, high, s00, s01);
// v_float32 d = s00*alpha0 + s01*alpha1;
v_float32x8 d = v_fma(s00 - s01, alpha0, s01);
v_store(&dst[line][x], d);
}
for (; x < outSz.width; ++x) {
float alpha0 = alpha[x];
float alpha1 = 1 - alpha0;
int sx0 = mapsx[x];
int sx1 = sx0 + 1;
dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
}
}
}
else if (!yRatioEq1)
{
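// Horizontal ratio is 1: blend the two source rows vertically, element by element.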
int length = inSz.width; // == outSz.width
for (int line = 0; line < lpi; ++line) {
float beta0 = beta[line];
float beta1 = 1 - beta0;
v_float32x8 v_beta0 = v256_setall_f32(beta0);
int x = 0;
for (; x <= length - nlanes; x += nlanes)
{
v_float32x8 s0 = v256_load(&src0[line][x]);
v_float32x8 s1 = v256_load(&src1[line][x]);
// v_float32 d = s0*beta0 + s1*beta1;
v_float32x8 d = v_fma(s0 - s1, v_beta0, s1);
v_store(&dst[line][x], d);
}
for (; x < length; ++x) {
dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x];
}
}
}
else
{
int length = inSz.width; // == outSz.width
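// No scaling in either direction: copy all lpi rows through in one memcpy
// (the rows are assumed to be laid out contiguously).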
memcpy(dst[0], src0[0], length * sizeof(float)*lpi);
}
}
} // namespace avx2
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // !defined(GAPI_STANDALONE)