Merge pull request #21728 from anna-khakimova:ak/resize_f32c1_avx_simd

GAPI Fluid: SIMD AVX2 Resize F32C1.

* GAPI Fluid: Resize F32C1 scalar.

* Final version

* GAPI Fluid: SIMD AVX2 for Resize F32C1.

* Applied comments.

* Deleted warning suppression.

* Applied comments.
Anna Khakimova 2022-03-25 18:11:01 +03:00 committed by GitHub
parent 9dd8e4df7f
commit e5bdab0355
2 changed files with 201 additions and 0 deletions


@@ -25,6 +25,9 @@
#include "gfluidimgproc_func.hpp"
#if CV_AVX2
#include "gfluidimgproc_simd_avx2.hpp"
#endif
#if CV_SSE4_1
#include "gfluidcore_simd_sse41.hpp"
#endif
@@ -2132,11 +2135,25 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
{
auto index0 = mapsy[outY + l] - inY;
auto index1 = mapsy[outSz.height + outY + l] - inY;
src0[l] = in.InLine<const float>(index0);
src1[l] = in.InLine<const float>(index1);
dst[l] = out.OutLine<float>(l);
}
#if CV_AVX2
// Number of floats in an AVX2 SIMD vector.
constexpr int nlanes = 8;
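// Take the AVX2 path only when both the input and output rows hold at least
// one full vector; otherwise fall through to the scalar loop below.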
if (inSz.width >= nlanes && outSz.width >= nlanes)
{
avx2::calcRowLinear32FC1Impl(dst, src0, src1, alpha, mapsx, beta,
inSz, outSz, lpi);
return;
}
#endif // CV_AVX2
using alpha_type = typename Mapper::alpha_type;
for (int l = 0; l < lpi; ++l)
{
@@ -2150,6 +2167,7 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
auto sx0 = mapsx[x];
auto sx1 = sx0 + 1;
float tmp0 = resize_main_calculation(b0, src0[l][sx0], b1, src1[l][sx0]);
float tmp1 = resize_main_calculation(b0, src0[l][sx1], b1, src1[l][sx1]);
dst[l][x] = resize_main_calculation(alpha0, tmp0, alpha1, tmp1);
@@ -2174,6 +2192,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
GAPI_Assert((in.depth == CV_8U && in.chan == 3) ||
(in.depth == CV_32F && in.chan == 1));
GAPI_Assert(interp == cv::INTER_LINEAR);
int outSz_w;
int outSz_h;
if (outSz.width == 0 || outSz.height == 0)
@@ -2212,6 +2231,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
GAPI_Assert((in.meta().depth == CV_8U && in.meta().chan == 3) ||
(in.meta().depth == CV_32F && in.meta().chan == 1));
GAPI_Assert(interp == cv::INTER_LINEAR);
const int channels = in.meta().chan;
const int depth = in.meta().depth;

gfluidimgproc_simd_avx2.hpp (new file)

@@ -0,0 +1,181 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2022 Intel Corporation
#if !defined(GAPI_STANDALONE)
#include "opencv2/gapi/own/saturate.hpp"
#include <immintrin.h>
#include "opencv2/core.hpp"
#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <limits>
#include <vector>
namespace cv {
namespace gapi {
namespace fluid {
namespace avx2 {
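// Gather eight (src[mapsx[i]], src[mapsx[i] + 1]) pairs with 64-bit loads:
// 'low' receives the pairs for mapsx[0..3], 'high' the pairs for mapsx[4..7].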
CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int* mapsx,
v_float32x8& low, v_float32x8& high)
{
low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[0]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[1]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[2]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[3]])));
high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[4]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[5]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[6]]),
*reinterpret_cast<const int64_t*>(&src[mapsx[7]])));
}
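// De-interleave the gathered pairs: 'even' collects the left neighbours
// (src[sx0]) and 'odd' the right neighbours (src[sx0 + 1]).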
CV_ALWAYS_INLINE void v_deinterleave(const v_float32x8& low, const v_float32x8& high,
v_float32x8& even, v_float32x8& odd)
{
__m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val);
__m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val);
__m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
__m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/));
odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/));
}
// Resize (bi-linear, 32FC1)
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi)
{
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
constexpr int nlanes = v_float32x8::nlanes;
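// Four specializations: scaling in both directions, horizontal only,
// vertical only, or a plain copy when neither dimension changes.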
if (!xRatioEq1 && !yRatioEq1)
{
for (int line = 0; line < lpi; ++line) {
float beta0 = beta[line];
float beta1 = 1 - beta0;
v_float32x8 v_beta0 = v256_setall_f32(beta0);
int x = 0;
v_float32x8 low1, high1, s00, s01;
v_float32x8 low2, high2, s10, s11;
for (; x <= outSz.width - nlanes; x += nlanes)
{
v_float32x8 alpha0 = v256_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_gather_pairs(src0[line], &mapsx[x], low1, high1);
v_deinterleave(low1, high1, s00, s01);
// v_float32 res0 = s00*alpha0 + s01*alpha1;
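// a*t + b*(1 - t) is folded into a single FMA as (a - b)*t + b.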
v_float32x8 res0 = v_fma(s00 - s01, alpha0, s01);
v_gather_pairs(src1[line], &mapsx[x], low2, high2);
v_deinterleave(low2, high2, s10, s11);
// v_float32 res1 = s10*alpha0 + s11*alpha1;
v_float32x8 res1 = v_fma(s10 - s11, alpha0, s11);
// v_float32 d = res0*beta0 + res1*beta1;
v_float32x8 d = v_fma(res0 - res1, v_beta0, res1);
v_store(&dst[line][x], d);
}
for (; x < outSz.width; ++x)
{
float alpha0 = alpha[x];
float alpha1 = 1 - alpha0;
int sx0 = mapsx[x];
int sx1 = sx0 + 1;
float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1;
dst[line][x] = beta0 * res0 + beta1 * res1;
}
}
}
else if (!xRatioEq1)
{
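// Vertical ratio is 1: interpolate along x only, no blend between rows.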
for (int line = 0; line < lpi; ++line) {
int x = 0;
v_float32x8 low, high, s00, s01;
for (; x <= outSz.width - nlanes; x += nlanes)
{
v_float32x8 alpha0 = v256_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_gather_pairs(src0[line], &mapsx[x], low, high);
v_deinterleave(low, high, s00, s01);
// v_float32 d = s00*alpha0 + s01*alpha1;
v_float32x8 d = v_fma(s00 - s01, alpha0, s01);
v_store(&dst[line][x], d);
}
for (; x < outSz.width; ++x) {
float alpha0 = alpha[x];
float alpha1 = 1 - alpha0;
int sx0 = mapsx[x];
int sx1 = sx0 + 1;
dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
}
}
}
else if (!yRatioEq1)
{
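// Horizontal ratio is 1: blend the two source rows vertically, element by element.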
int length = inSz.width; // == outSz.width
for (int line = 0; line < lpi; ++line) {
float beta0 = beta[line];
float beta1 = 1 - beta0;
v_float32x8 v_beta0 = v256_setall_f32(beta0);
int x = 0;
for (; x <= length - nlanes; x += nlanes)
{
v_float32x8 s0 = v256_load(&src0[line][x]);
v_float32x8 s1 = v256_load(&src1[line][x]);
// v_float32 d = s0*beta0 + s1*beta1;
v_float32x8 d = v_fma(s0 - s1, v_beta0, s1);
v_store(&dst[line][x], d);
}
for (; x < length; ++x) {
dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x];
}
}
}
else
{
int length = inSz.width; // == outSz.width
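// No scaling in either direction: copy all lpi rows through in one memcpy
// (the rows are assumed to be laid out contiguously).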
memcpy(dst[0], src0[0], length * sizeof(float)*lpi);
}
}
} // namespace avx2
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // !defined(GAPI_STANDALONE)