mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 06:03:15 +08:00
sse2 version of resize area fast for types CV_(8, 16)UC(1, 3, 4)
This commit is contained in:
parent
67ce03d7dd
commit
d1ca934115
@ -71,7 +71,7 @@ typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
|
||||
|
||||
PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
|
||||
testing::Combine(
|
||||
testing::Values(CV_8UC1, CV_8UC4),
|
||||
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4),
|
||||
testing::Values(szVGA, szqHD, sz720p, sz1080p),
|
||||
testing::Values(2)
|
||||
)
|
||||
|
@ -1241,16 +1241,163 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
|
||||
// No-op vector stage for the fast 2x2 area resize.
//
// Used for element types that have no SIMD kernel: operator() always reports
// that zero destination elements were produced, so a caller that starts its
// scalar loop at the returned index processes the whole row itself.
//
// Only one set of members is kept here; declaring the four-int constructor
// and operator() twice (with and without commented-out parameter names)
// would be a redefinition.
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    // Two-argument form; the arguments are ignored.
    ResizeAreaFastNoVec(int, int) { }

    // Four-argument form; the arguments are ignored.
    ResizeAreaFastNoVec(int, int, int, int) { }

    // Touches neither the source nor the destination row.
    // Returns 0: no destination elements were written.
    int operator() (const T*, T*, int) const
    {
        return 0;
    }
};
|
||||
|
||||
template<typename T>
|
||||
#if CV_SSE2
|
||||
class ResizeAreaFastVec_SIMD_8u
|
||||
{
|
||||
public:
|
||||
ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
|
||||
cn(_cn), step(_step)
|
||||
{
|
||||
use_simd = checkHardwareSupport(CV_CPU_SSE2);
|
||||
}
|
||||
|
||||
int operator() (const uchar* S, uchar* D, int w) const
|
||||
{
|
||||
if (!use_simd)
|
||||
return 0;
|
||||
|
||||
int dx = 0;
|
||||
const uchar* S0 = S;
|
||||
const uchar* S1 = S0 + step;
|
||||
__m128i masklow = _mm_set1_epi16(0x00ff);
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
if (cn == 1)
|
||||
{
|
||||
for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1));
|
||||
s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero));
|
||||
}
|
||||
}
|
||||
else if (cn == 3)
|
||||
for ( ; dx < w - 6; dx += 6, S0 += 12, S1 += 12, D += 6)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3));
|
||||
s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)D, s);
|
||||
_mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6));
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(cn == 4);
|
||||
for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4));
|
||||
s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)D, s);
|
||||
_mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8));
|
||||
}
|
||||
}
|
||||
|
||||
return dx;
|
||||
}
|
||||
|
||||
private:
|
||||
int cn;
|
||||
int step;
|
||||
bool use_simd;
|
||||
};
|
||||
|
||||
class ResizeAreaFastVec_SIMD_16u
|
||||
{
|
||||
public:
|
||||
ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
|
||||
cn(_cn), step(_step)
|
||||
{
|
||||
use_simd = checkHardwareSupport(CV_CPU_SSE2);
|
||||
}
|
||||
|
||||
int operator() (const ushort* S, ushort* D, int w) const
|
||||
{
|
||||
if (!use_simd)
|
||||
return 0;
|
||||
|
||||
int dx = 0;
|
||||
const ushort* S0 = (const ushort*)S;
|
||||
const ushort* S1 = (const ushort*)(S0 + step);
|
||||
__m128i masklow = _mm_set1_epi32(0x0000ffff);
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
if (cn == 1)
|
||||
{
|
||||
for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2));
|
||||
s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2)));
|
||||
|
||||
s = _mm_and_si128(s, masklow);
|
||||
s = _mm_packs_epi32(s, zero);
|
||||
_mm_storel_epi64((__m128i*)D, s);
|
||||
}
|
||||
}
|
||||
else if (cn == 3)
|
||||
for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6));
|
||||
s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)D, s);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(cn == 4);
|
||||
for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
|
||||
{
|
||||
__m128i s0 = _mm_loadu_si128((const __m128i*)S0);
|
||||
__m128i s1 = _mm_loadu_si128((const __m128i*)S1);
|
||||
|
||||
__m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8));
|
||||
s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8)));
|
||||
|
||||
_mm_storel_epi64((__m128i*)(D), s);
|
||||
}
|
||||
}
|
||||
|
||||
return dx;
|
||||
}
|
||||
|
||||
private:
|
||||
int cn;
|
||||
int step;
|
||||
bool use_simd;
|
||||
};
|
||||
|
||||
#else
|
||||
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
|
||||
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
|
||||
#endif
|
||||
|
||||
template<typename T, typename SIMDVecOp>
|
||||
struct ResizeAreaFastVec
|
||||
{
|
||||
ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
|
||||
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
|
||||
ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
|
||||
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
|
||||
{
|
||||
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
|
||||
}
|
||||
@ -1261,7 +1408,7 @@ struct ResizeAreaFastVec
|
||||
return 0;
|
||||
|
||||
const T* nextS = (const T*)((const uchar*)S + step);
|
||||
int dx = 0;
|
||||
int dx = vecOp(S, D, w);
|
||||
|
||||
if (cn == 1)
|
||||
for( ; dx < w; ++dx )
|
||||
@ -1279,7 +1426,7 @@ struct ResizeAreaFastVec
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(cn == 4);
|
||||
CV_Assert(cn == 4);
|
||||
for( ; dx < w; dx += 4 )
|
||||
{
|
||||
int index = dx*2;
|
||||
@ -1298,6 +1445,7 @@ private:
|
||||
int cn;
|
||||
bool fast_mode;
|
||||
int step;
|
||||
SIMDVecOp vecOp;
|
||||
};
|
||||
|
||||
template <typename T, typename WT, typename VecOp>
|
||||
@ -1702,10 +1850,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
|
||||
static ResizeAreaFastFunc areafast_tab[] =
|
||||
{
|
||||
resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar> >,
|
||||
resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
|
||||
0,
|
||||
resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort> >,
|
||||
resizeAreaFast_<short, float, ResizeAreaFastVec<short> >,
|
||||
resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
|
||||
resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
|
||||
0,
|
||||
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
|
||||
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
|
||||
@ -1764,9 +1912,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
// in case of scale_x && scale_y is equal to 2
|
||||
// INTER_AREA (fast) also is equal to INTER_LINEAR
|
||||
if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
|
||||
{
|
||||
interpolation = INTER_AREA;
|
||||
}
|
||||
|
||||
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
|
||||
// In other cases it is emulated using some variant of bilinear interpolation
|
||||
|
Loading…
Reference in New Issue
Block a user