mirror of
https://github.com/opencv/opencv.git
synced 2025-01-12 15:49:32 +08:00
390 lines
20 KiB
C++
390 lines
20 KiB
C++
|
/*
|
||
|
* By downloading, copying, installing or using the software you agree to this license.
|
||
|
* If you do not agree to this license, do not download, install,
|
||
|
* copy or use the software.
|
||
|
*
|
||
|
*
|
||
|
* License Agreement
|
||
|
* For Open Source Computer Vision Library
|
||
|
* (3-clause BSD License)
|
||
|
*
|
||
|
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||
|
* Third party copyrights are property of their respective owners.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without modification,
|
||
|
* are permitted provided that the following conditions are met:
|
||
|
*
|
||
|
* * Redistributions of source code must retain the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer.
|
||
|
*
|
||
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer in the documentation
|
||
|
* and/or other materials provided with the distribution.
|
||
|
*
|
||
|
* * Neither the names of the copyright holders nor the names of the contributors
|
||
|
* may be used to endorse or promote products derived from this software
|
||
|
* without specific prior written permission.
|
||
|
*
|
||
|
* This software is provided by the copyright holders and contributors "as is" and
|
||
|
* any express or implied warranties, including, but not limited to, the implied
|
||
|
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||
|
* In no event shall copyright holders or contributors be liable for any direct,
|
||
|
* indirect, incidental, special, exemplary, or consequential damages
|
||
|
* (including, but not limited to, procurement of substitute goods or services;
|
||
|
* loss of use, data, or profits; or business interruption) however caused
|
||
|
* and on any theory of liability, whether in contract, strict liability,
|
||
|
* or tort (including negligence or otherwise) arising in any way out of
|
||
|
* the use of this software, even if advised of the possibility of such damage.
|
||
|
*/
|
||
|
|
||
|
#include "common.hpp"
|
||
|
#include "vtransform.hpp"
|
||
|
|
||
|
namespace CAROTENE_NS {
|
||
|
|
||
|
#define FILL_LINES2(macro,type) \
|
||
|
macro##_LINE(type,0) \
|
||
|
macro##_LINE(type,1)
|
||
|
#define FILL_LINES3(macro,type) \
|
||
|
FILL_LINES2(macro,type) \
|
||
|
macro##_LINE(type,2)
|
||
|
#define FILL_LINES4(macro,type) \
|
||
|
FILL_LINES3(macro,type) \
|
||
|
macro##_LINE(type,3)
|
||
|
|
||
|
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
|
||
|
|
||
|
#ifdef CAROTENE_NEON
|
||
|
|
||
|
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
|
||
|
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
|
||
|
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
|
||
|
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
|
||
|
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
|
||
|
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
|
||
|
|
||
|
#define MUL2(val) (val << 1)
|
||
|
#define MUL3(val) (MUL2(val) + val)
|
||
|
#define MUL4(val) (val << 2)
|
||
|
|
||
|
#define CONTSRC2 dstStride == src0Stride && \
|
||
|
dstStride == src1Stride &&
|
||
|
#define CONTSRC3 dstStride == src0Stride && \
|
||
|
dstStride == src1Stride && \
|
||
|
dstStride == src2Stride &&
|
||
|
#define CONTSRC4 dstStride == src0Stride && \
|
||
|
dstStride == src1Stride && \
|
||
|
dstStride == src2Stride && \
|
||
|
dstStride == src3Stride &&
|
||
|
|
||
|
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||
|
|
||
|
#define MERGE_ASM2(sgn, bits) __asm__ ( \
|
||
|
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||
|
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||
|
"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
|
||
|
"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
|
||
|
: \
|
||
|
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
|
||
|
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
|
||
|
: "d0","d1","d2","d3" \
|
||
|
);
|
||
|
#define MERGE_ASM3(sgn, bits) __asm__ ( \
|
||
|
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||
|
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||
|
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
|
||
|
"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
|
||
|
"vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
|
||
|
: \
|
||
|
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
|
||
|
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
|
||
|
: "d0","d1","d2","d3","d4","d5" \
|
||
|
);
|
||
|
#define MERGE_ASM4(sgn, bits) __asm__ ( \
|
||
|
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
|
||
|
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
|
||
|
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
|
||
|
"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
|
||
|
"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
|
||
|
"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
|
||
|
: \
|
||
|
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
|
||
|
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
|
||
|
: "d0","d1","d2","d3","d4","d5","d6","d7" \
|
||
|
);
|
||
|
|
||
|
#define MERGE_QUAD(sgn, bits, n) { \
|
||
|
FILL_LINES##n(PREF, sgn##bits) \
|
||
|
MERGE_ASM##n(sgn, bits) \
|
||
|
}
|
||
|
|
||
|
#else
|
||
|
|
||
|
#define MERGE_QUAD(sgn, bits, n) { \
|
||
|
vec128 v_dst; \
|
||
|
/*FILL_LINES##n(PREF, sgn##bits) \
|
||
|
FILL_LINES##n(VLD1Q, sgn##bits)*/ \
|
||
|
FILL_LINES##n(PRLD, sgn##bits) \
|
||
|
vst##n##q_##sgn##bits(dst + dj, v_dst); \
|
||
|
}
|
||
|
|
||
|
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
|
||
|
|
||
|
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
|
||
|
FILL_LINES##n(FARG, sgn##bits), \
|
||
|
sgn##bits * dstBase, ptrdiff_t dstStride) \
|
||
|
{ \
|
||
|
internal::assertSupportedConfiguration(); \
|
||
|
Size2D size(_size); \
|
||
|
if (CONTSRC##n \
|
||
|
dstStride == (ptrdiff_t)(size.width)) \
|
||
|
{ \
|
||
|
size.width *= size.height; \
|
||
|
size.height = 1; \
|
||
|
} \
|
||
|
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
|
||
|
size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
|
||
|
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
|
||
|
size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
|
||
|
\
|
||
|
for (size_t i = 0u; i < size.height; ++i) \
|
||
|
{ \
|
||
|
FILL_LINES##n(VROW, sgn##bits) \
|
||
|
sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
|
||
|
size_t sj = 0u, dj = 0u; \
|
||
|
\
|
||
|
for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
|
||
|
MERGE_QUAD(sgn, bits, n) \
|
||
|
\
|
||
|
if ( sj < roiw8 ) \
|
||
|
{ \
|
||
|
vec64 v_dst; \
|
||
|
FILL_LINES##n(VLD1, sgn##bits) \
|
||
|
vst##n##_##sgn##bits(dst + dj, v_dst); \
|
||
|
sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
|
||
|
} \
|
||
|
\
|
||
|
for (; sj < size.width; ++sj, dj += n) \
|
||
|
{ \
|
||
|
FILL_LINES##n(SLD, sgn##bits) \
|
||
|
} \
|
||
|
} \
|
||
|
}
|
||
|
|
||
|
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
|
||
|
FILL_LINES##n(FARG, sgn##64), \
|
||
|
sgn##64 * dstBase, ptrdiff_t dstStride) \
|
||
|
{ \
|
||
|
internal::assertSupportedConfiguration(); \
|
||
|
Size2D size(_size); \
|
||
|
if (CONTSRC##n \
|
||
|
dstStride == (ptrdiff_t)(size.width)) \
|
||
|
{ \
|
||
|
size.width *= size.height; \
|
||
|
size.height = 1; \
|
||
|
} \
|
||
|
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
|
||
|
\
|
||
|
for (size_t i = 0u; i < size.height; ++i) \
|
||
|
{ \
|
||
|
FILL_LINES##n(VROW, sgn##64) \
|
||
|
sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
|
||
|
size_t sj = 0u, dj = 0u; \
|
||
|
\
|
||
|
for (; sj < size.width; ++sj, dj += n) \
|
||
|
{ \
|
||
|
vec64 v_dst; \
|
||
|
FILL_LINES##n(VLD1, sgn##64) \
|
||
|
vst##n##_##sgn##64(dst + dj, v_dst); \
|
||
|
/*FILL_LINES##n(SLD, sgn##64)*/ \
|
||
|
} \
|
||
|
} \
|
||
|
}
|
||
|
|
||
|
#else
|
||
|
|
||
|
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
|
||
|
|
||
|
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
|
||
|
FILL_LINES##n(FARG, sgn##bits), \
|
||
|
sgn##bits * dstBase, ptrdiff_t dstStride) \
|
||
|
{ \
|
||
|
internal::assertSupportedConfiguration(); \
|
||
|
(void)size; \
|
||
|
FILL_LINES##n(VOID, sgn##bits) \
|
||
|
(void)dstBase; \
|
||
|
(void)dstStride; \
|
||
|
}
|
||
|
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
|
||
|
|
||
|
#endif //CAROTENE_NEON
|
||
|
|
||
|
COMBINE(u, 8,2)
|
||
|
COMBINE(u, 8,3)
|
||
|
COMBINE(u, 8,4)
|
||
|
COMBINE(u,16,2)
|
||
|
COMBINE(u,16,3)
|
||
|
COMBINE(u,16,4)
|
||
|
COMBINE(s,32,2)
|
||
|
COMBINE(s,32,3)
|
||
|
COMBINE(s,32,4)
|
||
|
COMBINE64(s, 2)
|
||
|
COMBINE64(s, 3)
|
||
|
COMBINE64(s, 4)
|
||
|
|
||
|
void combineYUYV(const Size2D &size,
|
||
|
const u8 * srcyBase, ptrdiff_t srcyStride,
|
||
|
const u8 * srcuBase, ptrdiff_t srcuStride,
|
||
|
const u8 * srcvBase, ptrdiff_t srcvStride,
|
||
|
u8 * dstBase, ptrdiff_t dstStride)
|
||
|
{
|
||
|
internal::assertSupportedConfiguration();
|
||
|
#ifdef CAROTENE_NEON
|
||
|
#ifndef ANDROID
|
||
|
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||
|
#endif
|
||
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||
|
|
||
|
for (size_t i = 0u; i < size.height; i += 1)
|
||
|
{
|
||
|
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
|
||
|
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
|
||
|
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
|
||
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||
|
size_t syj = 0u, sj = 0u, dj = 0u;
|
||
|
|
||
|
#ifndef ANDROID
|
||
|
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
|
||
|
{
|
||
|
internal::prefetch(srcy + syj);
|
||
|
internal::prefetch(srcu + sj);
|
||
|
internal::prefetch(srcv + sj);
|
||
|
|
||
|
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
|
||
|
uint8x16x4_t v_dst;
|
||
|
v_dst.val[0] = v_y.val[0];
|
||
|
v_dst.val[1] = vld1q_u8(srcu + sj);
|
||
|
v_dst.val[2] = v_y.val[1];
|
||
|
v_dst.val[3] = vld1q_u8(srcv + sj);
|
||
|
vst4q_u8(dst + dj, v_dst);
|
||
|
|
||
|
v_y = vld2q_u8(srcy + syj + 32);
|
||
|
v_dst.val[0] = v_y.val[0];
|
||
|
v_dst.val[1] = vld1q_u8(srcu + sj + 16);
|
||
|
v_dst.val[2] = v_y.val[1];
|
||
|
v_dst.val[3] = vld1q_u8(srcv + sj + 16);
|
||
|
vst4q_u8(dst + dj + 64, v_dst);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
|
||
|
{
|
||
|
uint8x8x2_t v_y = vld2_u8(srcy + syj);
|
||
|
uint8x8x4_t v_dst;
|
||
|
v_dst.val[0] = v_y.val[0];
|
||
|
v_dst.val[1] = vld1_u8(srcu + sj);
|
||
|
v_dst.val[2] = v_y.val[1];
|
||
|
v_dst.val[3] = vld1_u8(srcv + sj);
|
||
|
vst4_u8(dst + dj, v_dst);
|
||
|
}
|
||
|
|
||
|
for (; sj < size.width; ++sj, syj += 2, dj += 4)
|
||
|
{
|
||
|
dst[dj] = srcy[syj];
|
||
|
dst[dj + 1] = srcu[sj];
|
||
|
dst[dj + 2] = srcy[syj + 1];
|
||
|
dst[dj + 3] = srcv[sj];
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
(void)size;
|
||
|
(void)srcyBase;
|
||
|
(void)srcyStride;
|
||
|
(void)srcuBase;
|
||
|
(void)srcuStride;
|
||
|
(void)srcvBase;
|
||
|
(void)srcvStride;
|
||
|
(void)dstBase;
|
||
|
(void)dstStride;
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void combineUYVY(const Size2D &size,
|
||
|
const u8 * srcyBase, ptrdiff_t srcyStride,
|
||
|
const u8 * srcuBase, ptrdiff_t srcuStride,
|
||
|
const u8 * srcvBase, ptrdiff_t srcvStride,
|
||
|
u8 * dstBase, ptrdiff_t dstStride)
|
||
|
{
|
||
|
internal::assertSupportedConfiguration();
|
||
|
#ifdef CAROTENE_NEON
|
||
|
#ifndef ANDROID
|
||
|
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
|
||
|
#endif
|
||
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||
|
|
||
|
for (size_t i = 0u; i < size.height; ++i)
|
||
|
{
|
||
|
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
|
||
|
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
|
||
|
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
|
||
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
||
|
size_t syj = 0u, sj = 0u, dj = 0u;
|
||
|
|
||
|
#ifndef ANDROID
|
||
|
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
|
||
|
{
|
||
|
internal::prefetch(srcy + syj);
|
||
|
internal::prefetch(srcu + sj);
|
||
|
internal::prefetch(srcv + sj);
|
||
|
|
||
|
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
|
||
|
uint8x16x4_t v_dst;
|
||
|
v_dst.val[0] = vld1q_u8(srcu + sj);
|
||
|
v_dst.val[1] = v_y.val[0];
|
||
|
v_dst.val[2] = vld1q_u8(srcv + sj);
|
||
|
v_dst.val[3] = v_y.val[1];
|
||
|
vst4q_u8(dst + dj, v_dst);
|
||
|
|
||
|
v_y = vld2q_u8(srcy + syj + 32);
|
||
|
v_dst.val[0] = vld1q_u8(srcu + sj + 16);
|
||
|
v_dst.val[1] = v_y.val[0];
|
||
|
v_dst.val[2] = vld1q_u8(srcv + sj + 16);
|
||
|
v_dst.val[3] = v_y.val[1];
|
||
|
vst4q_u8(dst + dj + 64, v_dst);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
|
||
|
{
|
||
|
uint8x8x2_t v_y = vld2_u8(srcy + syj);
|
||
|
uint8x8x4_t v_dst;
|
||
|
v_dst.val[0] = vld1_u8(srcu + sj);
|
||
|
v_dst.val[1] = v_y.val[0];
|
||
|
v_dst.val[2] = vld1_u8(srcv + sj);
|
||
|
v_dst.val[3] = v_y.val[1];
|
||
|
vst4_u8(dst + dj, v_dst);
|
||
|
}
|
||
|
|
||
|
for (; sj < size.width; ++sj, syj += 2, dj += 4)
|
||
|
{
|
||
|
dst[dj] = srcu[sj];
|
||
|
dst[dj + 1] = srcy[syj];
|
||
|
dst[dj + 2] = srcv[sj];
|
||
|
dst[dj + 3] = srcy[syj + 1];
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
(void)size;
|
||
|
(void)srcyBase;
|
||
|
(void)srcyStride;
|
||
|
(void)srcuBase;
|
||
|
(void)srcuStride;
|
||
|
(void)srcvBase;
|
||
|
(void)srcvStride;
|
||
|
(void)dstBase;
|
||
|
(void)dstStride;
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
} // namespace CAROTENE_NS
|