GAPI Fluid Resize: AVX2 implementation.

Anna Khakimova 2022-03-01 11:11:49 +03:00
parent 13a995cc1d
commit abd237c67a
3 changed files with 533 additions and 8 deletions


@@ -497,17 +497,20 @@ namespace imgproc {
};
G_TYPED_KERNEL(GResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.imgproc.transform.resize") {
static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int /*interp*/) {
static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int interp) {
GAPI_Assert(in.depth == CV_8U);
GAPI_Assert(in.chan == 3);
GAPI_Assert(interp == cv::INTER_LINEAR);
if (sz.width != 0 && sz.height != 0)
{
return in.withSize(sz);
return in.withType(CV_8U, 3).withSize(sz);
}
else
{
int outSz_w = saturate_cast<int>(in.size.width * fx);
int outSz_h = saturate_cast<int>(in.size.height * fy);
GAPI_Assert(outSz_w > 0 && outSz_h > 0);
return in.withSize(Size(outSz_w, outSz_h));
return in.withType(CV_8U, 3).withSize(Size(outSz_w, outSz_h));
}
}
};
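
For context, the CV_8UC3/INTER_LINEAR restriction asserted in outMeta above is reached through the ordinary G-API resize call. A minimal usage sketch (the fluid kernel package name cv::gapi::imgproc::fluid::kernels() is an assumption here, hedged in the comments):

// Hedged usage sketch: a small G-API graph that the GResize fluid kernel above
// would serve for a CV_8UC3 input resized with INTER_LINEAR.
#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>
#include <opencv2/gapi/fluid/imgproc.hpp>   // assumed location of the fluid imgproc kernel package

cv::Mat resizeWithFluid(const cv::Mat& input, const cv::Size& outSz)
{
    cv::GMat in;
    cv::GMat out = cv::gapi::resize(in, outSz, 0.0, 0.0, cv::INTER_LINEAR);
    cv::GComputation graph(cv::GIn(in), cv::GOut(out));

    cv::Mat result;
    graph.apply(cv::gin(input), cv::gout(result),
                cv::compile_args(cv::gapi::imgproc::fluid::kernels()));
    return result;
}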


@@ -2024,7 +2024,8 @@ struct Mapper {
template<typename T, class Mapper, int numChan>
CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
cv::gapi::fluid::Buffer& out,
cv::gapi::fluid::Buffer& scratch) {
cv::gapi::fluid::Buffer& scratch)
{
using alpha_type = typename Mapper::alpha_type;
auto inSz = in.meta().size;
@@ -2057,10 +2058,26 @@ CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
dst[l] = out.OutLine<T>(l);
}
#if CV_SSE4_1
#if CV_SIMD
const auto* clone = scr.clone;
auto* tmp = scr.tmp;
#if CV_AVX2
if (inSz.width >= 32 && outSz.width >= 32)
{
avx2::calcRowLinear_8UC_Impl_(reinterpret_cast<uint8_t**>(dst),
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
return;
}
#endif // CV_AVX2
#if CV_SSE4_1
if (inSz.width >= 16 && outSz.width >= 16)
{
sse41::calcRowLinear_8UC_Impl_<numChan>(reinterpret_cast<uint8_t**>(dst),
@@ -2076,20 +2093,21 @@ CV_ALWAYS_INLINE void calcRowLinearC(const cv::gapi::fluid::View & in,
return;
}
#endif // CV_SSE4_1
#endif // CV_SIMD
int length = out.length();
for (int l = 0; l < lpi; l++) {
for (int l = 0; l < lpi; ++l) {
constexpr static const auto unity = Mapper::unity;
auto beta0 = beta[l];
auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);
for (int x = 0; x < length; x++) {
for (int x = 0; x < length; ++x) {
auto alpha0 = alpha[x];
auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
auto sx0 = mapsx[x];
auto sx1 = sx0 + 1;
for (int c = 0; c < numChan; c++) {
for (int c = 0; c < numChan; ++c) {
auto idx0 = numChan*sx0 + c;
auto idx1 = numChan*sx1 + c;
T tmp0 = resize_calc_revert_fixedpoint(beta0, src0[l][idx0], beta1, src1[l][idx0]);

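All three paths selected above (AVX2, SSE4.1, and the scalar fallback) evaluate the same Q15 fixed-point blend, with unity == 1 << 15 and round-to-nearest via an added 1 << 14 before the shift. A small, hedged numeric sketch of that arithmetic (names are illustrative, not the actual helpers):

// Hedged sketch of the Q15 blend shared by the branches above: beta0 + beta1 == unity,
// and blending 100 with 200 at beta = 0.25 reproduces the exact result 175.
constexpr int unity = 1 << 15;
constexpr int half  = 1 << 14;

constexpr int blendQ15(int beta0, int p0, int p1)
{
    return (beta0 * p0 + (unity - beta0) * p1 + half) >> 15;
}

static_assert(blendQ15(unity / 4, 100, 200) == 175, "0.25 * 100 + 0.75 * 200 == 175");
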

@@ -174,6 +174,510 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
memcpy(dst[0], src0[0], length * sizeof(float)*lpi);
}
}
CV_ALWAYS_INLINE void resize_horizontal_anyLPI(uint8_t* dst,
const uchar* src, const short mapsx[],
const short alpha[], const int width)
{
constexpr int nlanes = 16;
constexpr int chanNum = 3;
__m128i zero = _mm_setzero_si128();
for (int x = 0; width >= nlanes;)
{
for (; x <= width - nlanes; x += nlanes)
{
__m128i a012 = _mm_setr_epi16(alpha[x], alpha[x], alpha[x], alpha[x + 1],
alpha[x + 1], alpha[x + 1], alpha[x + 2], alpha[x + 2]);
__m128i a2345 = _mm_setr_epi16(alpha[x + 2], alpha[x + 3], alpha[x + 3], alpha[x + 3],
alpha[x + 4], alpha[x + 4], alpha[x + 4], alpha[x + 5]);
__m128i a567 = _mm_setr_epi16(alpha[x + 5], alpha[x + 5], alpha[x + 6], alpha[x + 6],
alpha[x + 6], alpha[x + 7], alpha[x + 7], alpha[x + 7]);
__m128i a8910 = _mm_setr_epi16(alpha[x + 8], alpha[x + 8], alpha[x + 8], alpha[x + 9],
alpha[x + 9], alpha[x + 9], alpha[x + 10], alpha[x + 10]);
__m128i a10111213 = _mm_setr_epi16(alpha[x + 10], alpha[x + 11], alpha[x + 11], alpha[x + 11],
alpha[x + 12], alpha[x + 12], alpha[x + 12], alpha[x + 13]);
__m128i a131415 = _mm_setr_epi16(alpha[x + 13], alpha[x + 13], alpha[x + 14], alpha[x + 14],
alpha[x + 14], alpha[x + 15], alpha[x + 15], alpha[x + 15]);
__m128i a1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 0)], src[chanNum * (mapsx[x] + 0) + 1], src[chanNum * (mapsx[x] + 0) + 2],
src[chanNum * (mapsx[x + 1] + 0)], src[chanNum * (mapsx[x + 1] + 0) + 1], src[chanNum * (mapsx[x + 1] + 0) + 2],
src[chanNum * (mapsx[x + 2] + 0)], src[chanNum * (mapsx[x + 2] + 0) + 1], src[chanNum * (mapsx[x + 2] + 0) + 2],
src[chanNum * (mapsx[x + 3] + 0)], src[chanNum * (mapsx[x + 3] + 0) + 1], src[chanNum * (mapsx[x + 3] + 0) + 2],
src[chanNum * (mapsx[x + 4] + 0)], src[chanNum * (mapsx[x + 4] + 0) + 1], src[chanNum * (mapsx[x + 4] + 0) + 2],
src[chanNum * (mapsx[x + 5] + 0)]);
__m128i b1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 1)], src[chanNum * (mapsx[x] + 1) + 1], src[chanNum * (mapsx[x] + 1) + 2],
src[chanNum * (mapsx[x + 1] + 1)], src[chanNum * (mapsx[x + 1] + 1) + 1], src[chanNum * (mapsx[x + 1] + 1) + 2],
src[chanNum * (mapsx[x + 2] + 1)], src[chanNum * (mapsx[x + 2] + 1) + 1], src[chanNum * (mapsx[x + 2] + 1) + 2],
src[chanNum * (mapsx[x + 3] + 1)], src[chanNum * (mapsx[x + 3] + 1) + 1], src[chanNum * (mapsx[x + 3] + 1) + 2],
src[chanNum * (mapsx[x + 4] + 1)], src[chanNum * (mapsx[x + 4] + 1) + 1], src[chanNum * (mapsx[x + 4] + 1) + 2],
src[chanNum * (mapsx[x + 5] + 1)]);
__m128i a2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 0) + 1], src[chanNum * (mapsx[x + 5] + 0) + 2], src[chanNum * (mapsx[x + 6] + 0)],
src[chanNum * (mapsx[x + 6] + 0) + 1], src[chanNum * (mapsx[x + 6] + 0) + 2], src[chanNum * (mapsx[x + 7] + 0)],
src[chanNum * (mapsx[x + 7] + 0) + 1], src[chanNum * (mapsx[x + 7] + 0) + 2], src[chanNum * (mapsx[x + 8] + 0)],
src[chanNum * (mapsx[x + 8] + 0) + 1], src[chanNum * (mapsx[x + 8] + 0) + 2], src[chanNum * (mapsx[x + 9] + 0)],
src[chanNum * (mapsx[x + 9] + 0) + 1], src[chanNum * (mapsx[x + 9] + 0) + 2], src[chanNum * (mapsx[x + 10] + 0)],
src[chanNum * (mapsx[x + 10] + 0) + 1]);
__m128i b2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 1) + 1], src[chanNum * (mapsx[x + 5] + 1) + 2], src[chanNum * (mapsx[x + 6] + 1)],
src[chanNum * (mapsx[x + 6] + 1) + 1], src[chanNum * (mapsx[x + 6] + 1) + 2], src[chanNum * (mapsx[x + 7] + 1)],
src[chanNum * (mapsx[x + 7] + 1) + 1], src[chanNum * (mapsx[x + 7] + 1) + 2], src[chanNum * (mapsx[x + 8] + 1)],
src[chanNum * (mapsx[x + 8] + 1) + 1], src[chanNum * (mapsx[x + 8] + 1) + 2], src[chanNum * (mapsx[x + 9] + 1)],
src[chanNum * (mapsx[x + 9] + 1) + 1], src[chanNum * (mapsx[x + 9] + 1) + 2], src[chanNum * (mapsx[x + 10] + 1)],
src[chanNum * (mapsx[x + 10] + 1) + 1]);
__m128i a3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 0) + 2], src[chanNum * (mapsx[x + 11] + 0)], src[chanNum * (mapsx[x + 11] + 0) + 1],
src[chanNum * (mapsx[x + 11] + 0) + 2], src[chanNum * (mapsx[x + 12] + 0)], src[chanNum * (mapsx[x + 12] + 0) + 1],
src[chanNum * (mapsx[x + 12] + 0) + 2], src[chanNum * (mapsx[x + 13] + 0)], src[chanNum * (mapsx[x + 13] + 0) + 1],
src[chanNum * (mapsx[x + 13] + 0) + 2], src[chanNum * (mapsx[x + 14] + 0)], src[chanNum * (mapsx[x + 14] + 0) + 1],
src[chanNum * (mapsx[x + 14] + 0) + 2], src[chanNum * (mapsx[x + 15] + 0)], src[chanNum * (mapsx[x + 15] + 0) + 1],
src[chanNum * (mapsx[x + 15] + 0) + 2]);
__m128i b3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 1) + 2], src[chanNum * (mapsx[x + 11] + 1)], src[chanNum * (mapsx[x + 11] + 1) + 1],
src[chanNum * (mapsx[x + 11] + 1) + 2], src[chanNum * (mapsx[x + 12] + 1)], src[chanNum * (mapsx[x + 12] + 1) + 1],
src[chanNum * (mapsx[x + 12] + 1) + 2], src[chanNum * (mapsx[x + 13] + 1)], src[chanNum * (mapsx[x + 13] + 1) + 1],
src[chanNum * (mapsx[x + 13] + 1) + 2], src[chanNum * (mapsx[x + 14] + 1)], src[chanNum * (mapsx[x + 14] + 1) + 1],
src[chanNum * (mapsx[x + 14] + 1) + 2], src[chanNum * (mapsx[x + 15] + 1)], src[chanNum * (mapsx[x + 15] + 1) + 1],
src[chanNum * (mapsx[x + 15] + 1) + 2]);
__m128i a11 = _mm_unpacklo_epi8(a1, zero);
__m128i a12 = _mm_unpackhi_epi8(a1, zero);
__m128i a21 = _mm_unpacklo_epi8(a2, zero);
__m128i a22 = _mm_unpackhi_epi8(a2, zero);
__m128i a31 = _mm_unpacklo_epi8(a3, zero);
__m128i a32 = _mm_unpackhi_epi8(a3, zero);
__m128i b11 = _mm_unpacklo_epi8(b1, zero);
__m128i b12 = _mm_unpackhi_epi8(b1, zero);
__m128i b21 = _mm_unpacklo_epi8(b2, zero);
__m128i b22 = _mm_unpackhi_epi8(b2, zero);
__m128i b31 = _mm_unpacklo_epi8(b3, zero);
__m128i b32 = _mm_unpackhi_epi8(b3, zero);
__m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a11, b11), a012);
__m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a12, b12), a2345);
__m128i r3 = _mm_mulhrs_epi16(_mm_sub_epi16(a21, b21), a567);
__m128i r4 = _mm_mulhrs_epi16(_mm_sub_epi16(a22, b22), a8910);
__m128i r5 = _mm_mulhrs_epi16(_mm_sub_epi16(a31, b31), a10111213);
__m128i r6 = _mm_mulhrs_epi16(_mm_sub_epi16(a32, b32), a131415);
__m128i r_1 = _mm_add_epi16(b11, r1);
__m128i r_2 = _mm_add_epi16(b12, r2);
__m128i r_3 = _mm_add_epi16(b21, r3);
__m128i r_4 = _mm_add_epi16(b22, r4);
__m128i r_5 = _mm_add_epi16(b31, r5);
__m128i r_6 = _mm_add_epi16(b32, r6);
__m128i res1 = _mm_packus_epi16(r_1, r_2);
__m128i res2 = _mm_packus_epi16(r_3, r_4);
__m128i res3 = _mm_packus_epi16(r_5, r_6);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x]), res1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 16]), res2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 32]), res3);
}
if (x < width) {
x = width - nlanes;
continue;
}
break;
}
}
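The intrinsics in resize_horizontal_anyLPI above batch 16 output pixels at a time; per pixel and per channel they compute b + mulhrs(a - b, alpha), with a taken at mapsx[x] and b at mapsx[x] + 1. An illustrative scalar equivalent (a sketch for reference, not the shipped fallback):

#include <cstdint>

// Scalar model of the 3-channel horizontal pass above: blend the two
// neighbouring source pixels mapsx[x] and mapsx[x] + 1 per channel with the
// Q15 weight alpha[x], using the same rounding as _mm_mulhrs_epi16.
static void resize_horizontal_anyLPI_ref(uint8_t* dst, const uint8_t* src,
                                         const short mapsx[], const short alpha[],
                                         int width)
{
    constexpr int chanNum = 3;
    constexpr int half = 1 << 14;
    for (int x = 0; x < width; ++x)
    {
        const int sx0 = mapsx[x];
        const int sx1 = sx0 + 1;
        for (int c = 0; c < chanNum; ++c)
        {
            const int a = src[chanNum * sx0 + c];
            const int b = src[chanNum * sx1 + c];
            const int v = b + (((a - b) * alpha[x] + half) >> 15);
            dst[chanNum * x + c] = static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }
}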
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[],
const int& length)
{
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
__m256i b0 = _mm256_set1_epi16(beta[0]);
__m256i b1 = _mm256_set1_epi16(beta[1]);
__m256i b2 = _mm256_set1_epi16(beta[2]);
__m256i b3 = _mm256_set1_epi16(beta[3]);
__m256i shuf_mask = _mm256_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
for (int w = 0; w < length; ) {
for (; w <= length - half_nlanes; w += half_nlanes)
{
__m256i val0_0 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[0][w])));
__m256i val0_1 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[1][w])));
__m256i val0_2 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[2][w])));
__m256i val0_3 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src0[3][w])));
__m256i val1_0 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[0][w])));
__m256i val1_1 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[1][w])));
__m256i val1_2 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[2][w])));
__m256i val1_3 = _mm256_cvtepu8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(&src1[3][w])));
__m256i t0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_0, val1_0), b0);
__m256i t1 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_1, val1_1), b1);
__m256i t2 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_2, val1_2), b2);
__m256i t3 = _mm256_mulhrs_epi16(_mm256_sub_epi16(val0_3, val1_3), b3);
__m256i r0 = _mm256_add_epi16(val1_0, t0);
__m256i r1 = _mm256_add_epi16(val1_1, t1);
__m256i r2 = _mm256_add_epi16(val1_2, t2);
__m256i r3 = _mm256_add_epi16(val1_3, t3);
__m256i q0 = _mm256_packus_epi16(r0, r1);
__m256i q1 = _mm256_packus_epi16(r2, r3);
__m256i q2 = _mm256_blend_epi16(q0, _mm256_slli_si256(q1, 4), 0xCC /*0b11001100*/);
__m256i q3 = _mm256_blend_epi16(_mm256_srli_si256(q0, 4), q1, 0xCC /*0b11001100*/);
__m256i q4 = _mm256_shuffle_epi8(q2, shuf_mask);
__m256i q5 = _mm256_shuffle_epi8(q3, shuf_mask);
__m256i q6 = _mm256_permute2x128_si256(q4, q5, 0x20);
__m256i q7 = _mm256_permute2x128_si256(q4, q5, 0x31);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&tmp[4 * w + 0]), q6);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&tmp[4 * w + 2 * half_nlanes]), q7);
}
if (w < length)
{
w = length - half_nlanes;
}
}
}
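verticalPass_lpi4_8U above blends four source row pairs and stores them interleaved, so the blended values of rows 0..3 for source byte w land at tmp[4*w + 0..3]; this is the layout the horizontal pass of calcRowLinear_8UC_Impl reads back as tmp[4*(chanNum*sx + c) + l]. An illustrative scalar version of the same pass (a sketch only; the SIMD shuffles above just produce this interleaving in 32-byte blocks):

#include <cstdint>

// Scalar model of the lpi == 4 vertical pass: for each source byte w, the
// Q15-blended values of the four line pairs are written to tmp[4*w + l].
static void verticalPass_lpi4_8U_ref(const uint8_t* src0[], const uint8_t* src1[],
                                     uint8_t tmp[], const short beta[], int length)
{
    constexpr int half = 1 << 14;
    for (int l = 0; l < 4; ++l)
    {
        const int b0 = beta[l];          // Q15 weight of src0[l]
        const int b1 = (1 << 15) - b0;   // Q15 weight of src1[l]
        for (int w = 0; w < length; ++w)
        {
            const int v = (b0 * src0[l][w] + b1 * src1[l][w] + half) >> 15;
            tmp[4 * w + l] = static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }
}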
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0,
const int l, const int length1, const int length2) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length1 >= half_nlanes);
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16x16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
v_int16x16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
v_int16x16 t = v_mulhrs(s0 - s1, beta0) + s1;
v_pack_u_store(tmp + w, t);
}
if (w < length1) {
w = length1 - half_nlanes;
}
}
}
CV_ALWAYS_INLINE v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos)
{
constexpr int chanNum = 3;
v_int16x16 r;
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 0) + pos) + channel]), 0);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 1) + pos) + channel]), 1);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 2) + pos) + channel]), 2);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 3) + pos) + channel]), 3);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 4) + pos) + channel]), 4);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 5) + pos) + channel]), 5);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 6) + pos) + channel]), 6);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 7) + pos) + channel]), 7);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 8) + pos) + channel]), 8);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 9) + pos) + channel]), 9);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 10) + pos) + channel]), 10);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 11) + pos) + channel]), 11);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 12) + pos) + channel]), 12);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 13) + pos) + channel]), 13);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 14) + pos) + channel]), 14);
r.val = _mm256_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum * (_mm256_extract_epi16(index.val, 15) + pos) + channel]), 15);
return r;
}
CV_ALWAYS_INLINE void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[],
int chanNum, int c, int x, int shift)
{
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 0] + c)]), 0);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 1] + c)]), 1);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 2] + c)]), 2);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + shift + 3] + c)]), 3);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 0] + 1) + c)]), 4);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 1] + 1) + c)]), 5);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 2] + 1) + c)]), 6);
vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7);
}
CV_ALWAYS_INLINE v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b)
{
return v_int16x16(_mm256_mulhrs_epi16(a.val, b.val));
}
static inline v_int16x16 v_mulhrs(const v_int16x16& a, short b)
{
return v_mulhrs(a, v256_setall_s16(b));
}
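v_mulhrs above wraps _mm256_mulhrs_epi16, the Q15 multiply-with-rounding that the blends in this file rely on directly or via this wrapper. Per 16-bit lane it is equivalent to the scalar expression below (reference sketch):

// Scalar model of one lane of _mm256_mulhrs_epi16: widen, multiply, add the
// rounding constant 2^14, and shift the Q15 product back down.
// (The single overflow corner a == b == INT16_MIN wraps to 0x8000, matching the instruction.)
static inline short mulhrs_scalar(short a, short b)
{
    return static_cast<short>((static_cast<int>(a) * b + (1 << 14)) >> 15);
}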
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short* clone, // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi)
{
constexpr int nlanes = 32; // number of 8-bit integers that fit into a 256-bit SIMD vector.
constexpr int half_nlanes = nlanes / 2;
constexpr int chanNum = 3;
if ((inSz.width * chanNum < half_nlanes) || (outSz.width < half_nlanes))
return;
const int shift = (half_nlanes / 4);
if (4 == lpi)
{
verticalPass_lpi4_8U(src0, src1, tmp, beta,
inSz.width*chanNum);
// horizontal pass
__m256i shuff_mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
constexpr int nproc_pixels = 5;
for (int x = 0;;)
{
for (; x <= outSz.width - (nproc_pixels + 1); x += nproc_pixels)
{
v_int16x16 a10 = vx_load(&clone[4 * x]);
v_int16x16 a32 = vx_load(&clone[4 * (x + 4)]);
v_int16x16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16x16 a76 = vx_load(&clone[4 * (x + 12)]);
__m128i pix1 = _mm_setzero_si128();
pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x]) ]), 0);
pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * mapsx[x]) + 8]), 2);
__m128i pix2 = _mm_setzero_si128();
pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x] + 1)) ]), 0);
pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * (mapsx[x] + 1)) + 8]), 2);
__m128i pix3 = _mm_setzero_si128();
pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 1]) ]), 0);
pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * mapsx[x + 1]) + 8]), 2);
__m128i pix4 = _mm_setzero_si128();
pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1)) ]), 0);
pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * (mapsx[x + 1] + 1)) + 8]), 2);
__m256i ext_pix1 = _mm256_cvtepi8_epi16(pix1);
__m256i ext_pix2 = _mm256_cvtepi8_epi16(pix2);
__m256i ext_pix3 = _mm256_cvtepi8_epi16(pix3);
__m256i ext_pix4 = _mm256_cvtepi8_epi16(pix4);
__m256i t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a00);
__m256i t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a11);
__m256i r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
__m256i r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
__m256i q0_0 = _mm256_packus_epi16(r0_0, r1_0);
__m256i perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
__m256i shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
__m256i res1 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 2]) ]), 0);
pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * mapsx[x + 2]) + 8]), 2);
pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1)) ]), 0);
pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * (mapsx[x + 2] + 1)) + 8]), 2);
pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 3]) ]), 0);
pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * mapsx[x + 3]) + 8]), 2);
pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1)) ]), 0);
pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>( &tmp[4 * (chanNum * (mapsx[x + 3] + 1)) + 8]), 2);
ext_pix1 = _mm256_cvtepi8_epi16(pix1);
ext_pix2 = _mm256_cvtepi8_epi16(pix2);
ext_pix3 = _mm256_cvtepi8_epi16(pix3);
ext_pix4 = _mm256_cvtepi8_epi16(pix4);
t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a22);
t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a33);
r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
q0_0 = _mm256_packus_epi16(r0_0, r1_0);
perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
__m256i res2 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 4])]), 0);
pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 4]) + 8]), 2);
pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1))]), 0);
pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1)) + 8]), 2);
pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 5])]), 0);
pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 5]) + 8]), 2);
pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1))]), 0);
pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1)) + 8]), 2);
ext_pix1 = _mm256_cvtepi8_epi16(pix1);
ext_pix2 = _mm256_cvtepi8_epi16(pix2);
ext_pix3 = _mm256_cvtepi8_epi16(pix3);
ext_pix4 = _mm256_cvtepi8_epi16(pix4);
t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a44);
t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a55);
r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
q0_0 = _mm256_packus_epi16(r0_0, r1_0);
perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
__m256i res3 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 6])]), 0);
pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 6]) + 8]), 2);
pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1))]), 0);
pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1)) + 8]), 2);
pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 7])]), 0);
pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 7]) + 8]), 2);
pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1))]), 0);
pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1)) + 8]), 2);
ext_pix1 = _mm256_cvtepi8_epi16(pix1);
ext_pix2 = _mm256_cvtepi8_epi16(pix2);
ext_pix3 = _mm256_cvtepi8_epi16(pix3);
ext_pix4 = _mm256_cvtepi8_epi16(pix4);
t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a66);
t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a77);
r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
q0_0 = _mm256_packus_epi16(r0_0, r1_0);
perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
__m256i res4 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
pix1 = _mm_insert_epi64(pix1, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 8])]), 0);
pix1 = _mm_insert_epi32(pix1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 8]) + 8]), 2);
pix2 = _mm_insert_epi64(pix2, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 8] + 1))]), 0);
pix2 = _mm_insert_epi32(pix2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 8] + 1)) + 8]), 2);
pix3 = _mm_insert_epi64(pix3, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * mapsx[x + 9])]), 0);
pix3 = _mm_insert_epi32(pix3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * mapsx[x + 9]) + 8]), 2);
pix4 = _mm_insert_epi64(pix4, *reinterpret_cast<const int64_t*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1))]), 0);
pix4 = _mm_insert_epi32(pix4, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 9] + 1)) + 8]), 2);
ext_pix1 = _mm256_cvtepi8_epi16(pix1);
ext_pix2 = _mm256_cvtepi8_epi16(pix2);
ext_pix3 = _mm256_cvtepi8_epi16(pix3);
ext_pix4 = _mm256_cvtepi8_epi16(pix4);
t0_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix1, ext_pix2), a88);
t1_0 = _mm256_mulhrs_epi16(_mm256_sub_epi16(ext_pix3, ext_pix4), a99);
r0_0 = _mm256_add_epi16(ext_pix2, t0_0);
r1_0 = _mm256_add_epi16(ext_pix4, t1_0);
q0_0 = _mm256_packus_epi16(r0_0, r1_0);
perm64 = _mm256_permute4x64_epi64(q0_0, 216 /*11011000*/);
shuf = _mm256_shuffle_epi8(perm64, shuff_mask);
__m256i res5 = _mm256_permutevar8x32_epi32(shuf, perm_mask);
__m256i bl1 = _mm256_blend_epi16(res1, _mm256_slli_si256(res2, 8), 240 /*0b11110000*/);
__m256i bl2 = _mm256_blend_epi16(_mm256_srli_si256(res1, 8), res2, 240 /*0b11110000*/);
__m256i bl3 = _mm256_blend_epi16(res3, _mm256_slli_si256(res4, 8), 240 /*0b11110000*/);
__m256i bl4 = _mm256_blend_epi16(_mm256_srli_si256(res3, 8), res4, 240 /*0b11110000*/);
__m256i perm1 = _mm256_permute2x128_si256(bl1, bl3, 32);
__m256i perm2 = _mm256_permute2x128_si256(bl2, bl4, 32);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[0][chanNum * x]), bl1);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[1][chanNum * x]), bl2);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[2][chanNum * x]), bl3);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[3][chanNum * x]), bl4);
}
for (; x < outSz.width; ++x)
{
constexpr static const int ONE = 1 << 15;
constexpr static const int half = 1 << 14;
auto alpha0 = alpha[x];
auto alpha1 = saturate_cast<short>(ONE - alpha[x]);
for (int c = 0; c < chanNum; ++c)
{
dst[0][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) ] * alpha0 +
tmp[4 * (chanNum * (mapsx[x] + 1) + c) ] * alpha1 + half) >> 15;
dst[1][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 1] * alpha0 +
tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 1] * alpha1 + half) >> 15;
dst[2][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 2] * alpha0 +
tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 2] * alpha1 + half) >> 15;
dst[3][chanNum * x + c] = (tmp[4 * (chanNum * mapsx[x] + c) + 3] * alpha0 +
tmp[4 * (chanNum * (mapsx[x] + 1) + c) + 3] * alpha1 + half) >> 15;
}
}
break;
}
}
else
{ // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
for (int x = 0; x < outSz.width; ) {
for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
for (int c = 0; c < chanNum; ++c) {
v_int16x16 a0 = v256_load(&alpha[x]); // as signed Q1.1.14
v_int16x16 sx = v256_load(&mapsx[x]); // as integer (int16)
v_int16x16 t0 = v_gather_chan(tmp, sx, c, 0);
v_int16x16 t1 = v_gather_chan(tmp, sx, c, 1);
v_int16x16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
}
if (x < outSz.width) {
x = outSz.width - half_nlanes;
}
}
}
}
return;
}
} // namespace avx2
} // namespace fluid
} // namespace gapi