mirror of
https://github.com/opencv/opencv.git
synced 2024-11-27 20:50:25 +08:00
Merge pull request #21728 from anna-khakimova:ak/resize_f32c1_avx_simd
GAPI Fluid: SIMD AVX2 Resize F32C1. * GAPI Fluid: Resize F32C1 scalar. * Final version * GAPI Fluid: SIMD AVX2 for Resize F32C1. * Applied comments. * Deleted warning suppression. * Applied comments.
This commit is contained in:
parent
9dd8e4df7f
commit
e5bdab0355
@ -25,6 +25,9 @@
|
||||
|
||||
#include "gfluidimgproc_func.hpp"
|
||||
|
||||
#if CV_AVX2
|
||||
#include "gfluidimgproc_simd_avx2.hpp"
|
||||
#endif
|
||||
#if CV_SSE4_1
|
||||
#include "gfluidcore_simd_sse41.hpp"
|
||||
#endif
|
||||
@ -2132,11 +2135,25 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
|
||||
{
|
||||
auto index0 = mapsy[outY + l] - inY;
|
||||
auto index1 = mapsy[outSz.height + outY + l] - inY;
|
||||
|
||||
src0[l] = in.InLine<const float>(index0);
|
||||
src1[l] = in.InLine<const float>(index1);
|
||||
dst[l] = out.OutLine<float>(l);
|
||||
}
|
||||
|
||||
#if CV_AVX2
|
||||
// number floats in AVX2 SIMD vector.
|
||||
constexpr int nlanes = 8;
|
||||
|
||||
if (inSz.width >= nlanes && outSz.width >= nlanes)
|
||||
{
|
||||
avx2::calcRowLinear32FC1Impl(dst, src0, src1, alpha, mapsx, beta,
|
||||
inSz, outSz, lpi);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // CV_AVX2
|
||||
|
||||
using alpha_type = typename Mapper::alpha_type;
|
||||
for (int l = 0; l < lpi; ++l)
|
||||
{
|
||||
@ -2150,6 +2167,7 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in,
|
||||
auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
|
||||
auto sx0 = mapsx[x];
|
||||
auto sx1 = sx0 + 1;
|
||||
|
||||
float tmp0 = resize_main_calculation(b0, src0[l][sx0], b1, src1[l][sx0]);
|
||||
float tmp1 = resize_main_calculation(b0, src0[l][sx1], b1, src1[l][sx1]);
|
||||
dst[l][x] = resize_main_calculation(alpha0, tmp0, alpha1, tmp1);
|
||||
@ -2174,6 +2192,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
|
||||
GAPI_Assert((in.depth == CV_8U && in.chan == 3) ||
|
||||
(in.depth == CV_32F && in.chan == 1));
|
||||
GAPI_Assert(interp == cv::INTER_LINEAR);
|
||||
|
||||
int outSz_w;
|
||||
int outSz_h;
|
||||
if (outSz.width == 0 || outSz.height == 0)
|
||||
@ -2212,6 +2231,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true)
|
||||
GAPI_Assert((in.meta().depth == CV_8U && in.meta().chan == 3) ||
|
||||
(in.meta().depth == CV_32F && in.meta().chan == 1));
|
||||
GAPI_Assert(interp == cv::INTER_LINEAR);
|
||||
|
||||
const int channels = in.meta().chan;
|
||||
const int depth = in.meta().depth;
|
||||
|
||||
|
181
modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp
Normal file
181
modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp
Normal file
@ -0,0 +1,181 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
//
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
|
||||
#if !defined(GAPI_STANDALONE)
|
||||
|
||||
#include "opencv2/gapi/own/saturate.hpp"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "opencv2/core.hpp"
|
||||
|
||||
#include <opencv2/core/hal/intrin.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
namespace cv {
|
||||
namespace gapi {
|
||||
namespace fluid {
|
||||
namespace avx2 {
|
||||
|
||||
CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int* mapsx,
|
||||
v_float32x8& low, v_float32x8& high)
|
||||
{
|
||||
low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[0]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[1]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[2]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[3]])));
|
||||
high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[4]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[5]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[6]]),
|
||||
*reinterpret_cast<const int64_t*>(&src[mapsx[7]])));
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void v_deinterleave(const v_float32x8& low, const v_float32x8& high,
|
||||
v_float32x8& even, v_float32x8& odd)
|
||||
{
|
||||
__m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val);
|
||||
__m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val);
|
||||
__m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
|
||||
__m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
|
||||
even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/));
|
||||
odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/));
|
||||
}
|
||||
|
||||
// Resize (bi-linear, 32FC1)
|
||||
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
|
||||
const float *src0[],
|
||||
const float *src1[],
|
||||
const float alpha[],
|
||||
const int mapsx[],
|
||||
const float beta[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
const int lpi)
|
||||
{
|
||||
bool xRatioEq1 = inSz.width == outSz.width;
|
||||
bool yRatioEq1 = inSz.height == outSz.height;
|
||||
|
||||
constexpr int nlanes = v_float32x8::nlanes;
|
||||
|
||||
if (!xRatioEq1 && !yRatioEq1)
|
||||
{
|
||||
for (int line = 0; line < lpi; ++line) {
|
||||
float beta0 = beta[line];
|
||||
float beta1 = 1 - beta0;
|
||||
v_float32x8 v_beta0 = v256_setall_f32(beta0);
|
||||
int x = 0;
|
||||
|
||||
v_float32x8 low1, high1, s00, s01;
|
||||
v_float32x8 low2, high2, s10, s11;
|
||||
for (; x <= outSz.width - nlanes; x += nlanes)
|
||||
{
|
||||
v_float32x8 alpha0 = v256_load(&alpha[x]);
|
||||
// v_float32 alpha1 = 1.f - alpha0;
|
||||
|
||||
v_gather_pairs(src0[line], &mapsx[x], low1, high1);
|
||||
v_deinterleave(low1, high1, s00, s01);
|
||||
|
||||
// v_float32 res0 = s00*alpha0 + s01*alpha1;
|
||||
v_float32x8 res0 = v_fma(s00 - s01, alpha0, s01);
|
||||
|
||||
v_gather_pairs(src1[line], &mapsx[x], low2, high2);
|
||||
v_deinterleave(low2, high2, s10, s11);
|
||||
|
||||
// v_float32 res1 = s10*alpha0 + s11*alpha1;
|
||||
v_float32x8 res1 = v_fma(s10 - s11, alpha0, s11);
|
||||
// v_float32 d = res0*beta0 + res1*beta1;
|
||||
v_float32x8 d = v_fma(res0 - res1, v_beta0, res1);
|
||||
|
||||
v_store(&dst[line][x], d);
|
||||
}
|
||||
|
||||
for (; x < outSz.width; ++x)
|
||||
{
|
||||
float alpha0 = alpha[x];
|
||||
float alpha1 = 1 - alpha0;
|
||||
int sx0 = mapsx[x];
|
||||
int sx1 = sx0 + 1;
|
||||
float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
|
||||
float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1;
|
||||
dst[line][x] = beta0 * res0 + beta1 * res1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!xRatioEq1)
|
||||
{
|
||||
|
||||
for (int line = 0; line < lpi; ++line) {
|
||||
int x = 0;
|
||||
|
||||
v_float32x8 low, high, s00, s01;
|
||||
for (; x <= outSz.width - nlanes; x += nlanes)
|
||||
{
|
||||
v_float32x8 alpha0 = v256_load(&alpha[x]);
|
||||
// v_float32 alpha1 = 1.f - alpha0;
|
||||
|
||||
v_gather_pairs(src0[line], &mapsx[x], low, high);
|
||||
v_deinterleave(low, high, s00, s01);
|
||||
|
||||
// v_float32 d = s00*alpha0 + s01*alpha1;
|
||||
v_float32x8 d = v_fma(s00 - s01, alpha0, s01);
|
||||
|
||||
v_store(&dst[line][x], d);
|
||||
}
|
||||
|
||||
for (; x < outSz.width; ++x) {
|
||||
float alpha0 = alpha[x];
|
||||
float alpha1 = 1 - alpha0;
|
||||
int sx0 = mapsx[x];
|
||||
int sx1 = sx0 + 1;
|
||||
dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else if (!yRatioEq1)
|
||||
{
|
||||
int length = inSz.width; // == outSz.width
|
||||
|
||||
for (int line = 0; line < lpi; ++line) {
|
||||
float beta0 = beta[line];
|
||||
float beta1 = 1 - beta0;
|
||||
v_float32x8 v_beta0 = v256_setall_f32(beta0);
|
||||
int x = 0;
|
||||
|
||||
for (; x <= length - nlanes; x += nlanes)
|
||||
{
|
||||
v_float32x8 s0 = v256_load(&src0[line][x]);
|
||||
v_float32x8 s1 = v256_load(&src1[line][x]);
|
||||
|
||||
// v_float32 d = s0*beta0 + s1*beta1;
|
||||
v_float32x8 d = v_fma(s0 - s1, v_beta0, s1);
|
||||
|
||||
v_store(&dst[line][x], d);
|
||||
}
|
||||
|
||||
for (; x < length; ++x) {
|
||||
dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
int length = inSz.width; // == outSz.width
|
||||
memcpy(dst[0], src0[0], length * sizeof(float)*lpi);
|
||||
}
|
||||
}
|
||||
} // namespace avx2
|
||||
} // namespace fliud
|
||||
} // namespace gapi
|
||||
} // namespace cv
|
||||
#endif // !defined(GAPI_STANDALONE)
|
Loading…
Reference in New Issue
Block a user