Merge pull request #26505 from fengyuentau:imgproc/new_nearest_inter

imgproc: optimized nearest neighbour interpolation for warpAffine, warpPerspective and remap #26505

The PR description has a limit of 65536 characters, so the performance stats are attached below.
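For reference, the optimized nearest-neighbour paths are exercised through the standard public API; a minimal sketch (the transform matrices and map contents here are arbitrary examples, not taken from the PR):

#include <opencv2/imgproc.hpp>

void run_nearest_paths(const cv::Mat& src)
{
    cv::Mat dst;
    // warpAffine: 2x3 matrix, e.g. a 30-degree rotation about the center
    cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(src.cols/2.f, src.rows/2.f), 30.0, 1.0);
    cv::warpAffine(src, dst, M, src.size(), cv::INTER_NEAREST);
    // warpPerspective: 3x3 homography (identity used as a placeholder)
    cv::Mat P = cv::Mat::eye(3, 3, CV_64F);
    cv::warpPerspective(src, dst, P, src.size(), cv::INTER_NEAREST);
    // remap: CV_32F absolute coordinate maps, here a horizontal flip
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for (int y = 0; y < src.rows; y++)
        for (int x = 0; x < src.cols; x++) {
            mapx.at<float>(y, x) = (float)(src.cols - 1 - x);
            mapy.at<float>(y, x) = (float)y;
        }
    cv::remap(src, dst, mapx, mapy, cv::INTER_NEAREST);
}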

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng 2024-11-30 15:41:21 +08:00 committed by GitHub
parent b7dacbd5e3
commit b476ed6d06
10 changed files with 4026 additions and 1229 deletions


@@ -1761,6 +1761,35 @@ OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, __riscv_vnclip,
OPENCV_HAL_IMPL_RVV_PACK_32(v_uint32, unsigned, v_uint64, 32, u32, u64, __riscv_vnclipu, __riscv_vnsrl)
OPENCV_HAL_IMPL_RVV_PACK_32(v_int32, int, v_int64, 32, i32, i64, __riscv_vnclip, __riscv_vnsra)
template <int N = VTraits<v_uint16>::max_nlanes>
inline v_uint16 v_pack(const v_uint32& a, const v_uint32& b)
{
ushort bufa[N];
ushort bufb[N];
v_pack_store(bufa, a);
v_pack_store(bufb, b);
ushort buf[N];
for (int i = 0; i < N/2; i++) {
buf[i] = bufa[i];
buf[i+N/2] = bufb[i];
}
return v_load(buf);
}
template <> inline v_uint16 v_pack<4>(const v_uint32& a, const v_uint32& b)
{
constexpr int N = VTraits<v_uint16>::max_nlanes;
ushort bufa[N];
ushort bufb[N];
v_pack_store(bufa, a);
v_pack_store(bufb, b);
ushort buf[N];
buf[0] = bufa[0]; buf[1] = bufa[1]; buf[2] = bufa[2]; buf[3] = bufa[3];
buf[4] = bufb[0]; buf[5] = bufb[1]; buf[6] = bufb[2]; buf[7] = bufb[3];
return v_load(buf);
}
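For context, v_pack narrows each 32-bit lane to 16 bits with unsigned saturation and concatenates a's lanes ahead of b's; a scalar reference of the intended semantics (a sketch, not the RVV implementation):

#include <opencv2/core.hpp>

// n = lane count of each v_uint32 argument; out receives 2*n values.
static void v_pack_u32u16_ref(const unsigned* a, const unsigned* b, ushort* out, int n)
{
    for (int i = 0; i < n; i++) {
        out[i]     = cv::saturate_cast<ushort>(a[i]); // saturating narrow of a
        out[i + n] = cv::saturate_cast<ushort>(b[i]); // b's lanes appended after a's
    }
}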
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \


@@ -111,48 +111,6 @@ PERF_TEST_P( TestWarpPerspective, WarpPerspective,
SANITY_CHECK(dst, 1);
}
PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
Combine(
Values( Size(640,480), Size(1920,1080), Size(2592,1944) ),
InterType::all(),
BorderMode::all(),
Values( CV_8UC1, CV_8UC4 )
)
)
{
Size size;
int borderMode, interType, type;
size = get<0>(GetParam());
interType = get<1>(GetParam());
borderMode = get<2>(GetParam());
type = get<3>(GetParam());
Scalar borderColor = Scalar::all(150);
Mat src(size, type), dst(size, type);
cvtest::fillGradient<uint8_t>(src);
if(borderMode == BORDER_CONSTANT) cvtest::smoothBorder<uint8_t>(src, borderColor, 1);
int shift = static_cast<int>(src.cols*0.04);
Mat srcVertices = (Mat_<Vec2f>(1, 4) << Vec2f(0, 0),
Vec2f(static_cast<float>(size.width-1), 0),
Vec2f(static_cast<float>(size.width-1), static_cast<float>(size.height-1)),
Vec2f(0, static_cast<float>(size.height-1)));
Mat dstVertices = (Mat_<Vec2f>(1, 4) << Vec2f(0, static_cast<float>(shift)),
Vec2f(static_cast<float>(size.width-shift/2), 0),
Vec2f(static_cast<float>(size.width-shift), static_cast<float>(size.height-shift)),
Vec2f(static_cast<float>(shift/2), static_cast<float>(size.height-1)));
Mat warpMat = getPerspectiveTransform(srcVertices, dstVertices);
declare.in(src).out(dst);
declare.time(100);
TEST_CYCLE()
{
warpPerspective( src, dst, warpMat, size, interType, borderMode, borderColor );
}
SANITY_CHECK(dst, 1);
}
PERF_TEST_P( TestRemap, map1_32fc1,
Combine(
Values( szVGA, sz1080p ),


@@ -1672,6 +1672,56 @@ void cv::remap( InputArray _src, OutputArray _dst,
int type = src.type(), depth = CV_MAT_DEPTH(type);
if (interpolation == INTER_NEAREST && map1.depth() == CV_32F) {
const auto *src_data = src.ptr<const uchar>();
auto *dst_data = dst.ptr<uchar>();
size_t src_step = src.step, dst_step = dst.step,
map1_step = map1.step, map2_step = map2.step;
int src_rows = src.rows, src_cols = src.cols;
int dst_rows = dst.rows, dst_cols = dst.cols;
const float *map1_data = map1.ptr<const float>();
const float *map2_data = map2.ptr<const float>();
switch (src.type()) {
case CV_8UC1: {
CV_CPU_DISPATCH(remapNearestInvoker_8UC1, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC3: {
CV_CPU_DISPATCH(remapNearestInvoker_8UC3, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC4: {
CV_CPU_DISPATCH(remapNearestInvoker_8UC4, (src_data, src_step, src_rows, src_cols, dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC1: {
CV_CPU_DISPATCH(remapNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC3: {
CV_CPU_DISPATCH(remapNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC4: {
CV_CPU_DISPATCH(remapNearestInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC1: {
CV_CPU_DISPATCH(remapNearestInvoker_32FC1, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC3: {
CV_CPU_DISPATCH(remapNearestInvoker_32FC3, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC4: {
CV_CPU_DISPATCH(remapNearestInvoker_32FC4, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
// no default
}
}
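All of the dispatched invokers implement the same per-pixel rule; what remap with INTER_NEAREST computes, as a scalar sketch for the single-channel 8-bit case (border handling elided):

// dst(x, y) = src(round(mapx(x, y)), round(mapy(x, y)))
for (int y = 0; y < dst_rows; y++)
    for (int x = 0; x < dst_cols; x++) {
        float fx = map1_data[y*(map1_step/sizeof(float)) + x];
        float fy = map2_data[y*(map2_step/sizeof(float)) + x];
        if (hasRelativeFlag) { fx += x; fy += y; } // WARP_RELATIVE_MAP maps are offsets
        int sx = cvRound(fx), sy = cvRound(fy);
        if ((unsigned)sx < (unsigned)src_cols && (unsigned)sy < (unsigned)src_rows)
            dst_data[y*dst_step + x] = src_data[sy*src_step + sx];
        // else: resolve according to borderType / borderValue
    }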
if (interpolation == INTER_LINEAR) {
if (map1.depth() == CV_32F) {
const auto *src_data = src.ptr<const uint8_t>();
@@ -1708,27 +1758,27 @@ void cv::remap( InputArray _src, OutputArray _dst,
break;
}
case CV_16UC1: {
CV_CPU_DISPATCH(remapLinearInvoker_16UC1, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC3: {
CV_CPU_DISPATCH(remapLinearInvoker_16UC3, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC4: {
CV_CPU_DISPATCH(remapLinearInvoker_16UC4, ((uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_rows, src_cols, (uint16_t*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC1: {
CV_CPU_DISPATCH(remapLinearInvoker_32FC1, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_32FC1, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC3: {
CV_CPU_DISPATCH(remapLinearInvoker_32FC3, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_32FC3, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC4: {
CV_CPU_DISPATCH(remapLinearInvoker_32FC4, ((float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
CV_CPU_DISPATCH(remapLinearInvoker_32FC4, ((const float*)src_data, src_step, src_rows, src_cols, (float*)dst_data, dst_step, dst_rows, dst_cols, borderType, borderValue.val, map1_data, map1_step, map2_data, map2_step, hasRelativeFlag), CV_CPU_DISPATCH_MODES_ALL);
break;
}
// no default
@@ -2657,6 +2707,48 @@ static void warpAffine(int src_type,
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
if (interpolation == INTER_NEAREST) {
switch (src_type) {
case CV_8UC1: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC3: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC4: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC1: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC3: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC4: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC1: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC3: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC4: {
CV_CPU_DISPATCH(warpAffineNearestInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
// no default
}
}
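Each affine invoker reduces to the same coordinate rule: map the destination pixel through the 2x3 matrix M and snap to the closest source pixel (sketch, border handling elided):

float X0 = (float)(M[0]*x + M[1]*y + M[2]);
float Y0 = (float)(M[3]*x + M[4]*y + M[5]);
int sx = cvRound(X0), sy = cvRound(Y0);
// dst(y, x) = src(sy, sx) when (sx, sy) falls inside the source image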
if (interpolation == INTER_LINEAR) {
switch (src_type) {
case CV_8UC1: {
@@ -3324,46 +3416,99 @@ static void warpPerspective(int src_type,
{
CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
if (interpolation == INTER_NEAREST) {
switch (src_type) {
case CV_8UC1: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC3: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_8UC4: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC1: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC3: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC4: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC1: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC3: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC4: {
CV_CPU_DISPATCH(warpPerspectiveNearestInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
}
}
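The perspective variant only differs by the projective division: the destination pixel is pushed through the 3x3 homography M and normalized by W before rounding (sketch, mirroring the OpenCL kernel further down):

float W  = (float)(M[6]*x + M[7]*y + M[8]);
float X0 = (float)(M[0]*x + M[1]*y + M[2]) / W;
float Y0 = (float)(M[3]*x + M[4]*y + M[5]) / W;
int sx = cvRound(X0), sy = cvRound(Y0);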
if (interpolation == INTER_LINEAR) {
switch (src_type) {
case CV_8UC1: {
if (hint == cv::ALGO_HINT_APPROX) {
CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
} else {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC1, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
}
case CV_8UC3: {
if (hint == cv::ALGO_HINT_APPROX) {
CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
} else {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC3, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
}
case CV_8UC4: {
if (hint == cv::ALGO_HINT_APPROX) {
CV_CPU_DISPATCH(warpPerspectiveLinearApproxInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
} else {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_8UC4, (src_data, src_step, src_height, src_width, dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
}
case CV_16UC1: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC1, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC3: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC3, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_16UC4: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_16UC4, ((const uint16_t*)src_data, src_step, src_height, src_width, (uint16_t*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC1: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC1, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC3: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC3, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
case CV_32FC4: {
CV_CPU_DISPATCH(warpPerspectiveLinearInvoker_32FC4, ((const float*)src_data, src_step, src_height, src_width, (float*)dst_data, dst_step, dst_height, dst_width, M, borderType, borderValue), CV_CPU_DISPATCH_MODES_ALL);
break;
}
// no default
}


@@ -152,41 +152,35 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
if (x < dst_cols)
{
T scalar = convertScalar(nVal);
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
#pragma unroll
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (y < dst_rows)
for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map1_index += map1_step, map2_index += map2_step)
{
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
int gx = convert_int_sat_rte(map1[0]);
int gy = convert_int_sat_rte(map2[0]);
float X0 = map1[0];
float Y0 = map2[0];
#if WARP_RELATIVE
gx += x;
gy += y;
X0 += x;
Y0 += dy;
#endif
if (NEED_EXTRAPOLATION(gx, gy))
{
#ifndef BORDER_CONSTANT
int2 gxy = (int2)(gx, gy);
#endif
T v;
EXTRAPOLATE(gxy, v)
storepix(v, dst);
}
else
{
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T*)(srcptr + src_index)), dst);
int sx = convert_int_sat(rint(X0));
int sy = convert_int_sat(rint(Y0));
int2 map_data0 = (int2)(sx, sy);
T v0 = scalar;
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) {
v0 = loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))));
} else {
EXTRAPOLATE(map_data0, v0);
}
int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset));
storepix(v0, dstptr + dst_index);
}
}
}
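The restructured kernel replaces the per-iteration `if (y < dst_rows)` guard with a row range clamped once per work-item; the same pattern in C++ (sketch):

int dy1 = std::min(dst_rows, y + ROWS_PER_WI); // clamp once, needs <algorithm>
for (int dy = y; dy < dy1; ++dy) {
    // ... process destination row dy ...
}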
@@ -202,36 +196,34 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
if (x < dst_cols)
{
T scalar = convertScalar(nVal);
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
#pragma unroll
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map_index += map_step, dst_index += dst_step)
if (y < dst_rows)
for (int dy = y, dy1 = min(dst_rows, y + ROWS_PER_WI); dy < dy1; ++dy, map_index += map_step)
{
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
__global T * dst = (__global T *)(dstptr + dst_index);
float2 map_data = map[0];
int2 gxy = convert_int2_sat_rte(map[0]);
float X0 = map_data.x;
float Y0 = map_data.y;
#if WARP_RELATIVE
gxy.x += x;
gxy.y += y;
X0 += x;
Y0 += dy;
#endif
int gx = gxy.x, gy = gxy.y;
int sx = convert_int_sat(rint(X0));
int sy = convert_int_sat(rint(Y0));
if (NEED_EXTRAPOLATION(gx, gy))
{
T v;
EXTRAPOLATE(gxy, v)
storepix(v, dst);
}
else
{
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
int2 map_data0 = (int2)(sx, sy);
T v0 = scalar;
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) {
v0 = loadpix((__global const T *)(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))));
} else {
EXTRAPOLATE(map_data0, v0);
}
int dst_index = mad24(dy, dst_step, mad24(x, TSIZE, dst_offset));
storepix(v0, dstptr + dst_index);
}
}
}


@@ -93,27 +93,25 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
if (dx < dst_cols)
{
int round_delta = (AB_SCALE >> 1);
float X0_ = fma(M[0], (CT)dx, M[2]);
float Y0_ = fma(M[3], (CT)dx, M[5]);
int X0_ = rint(M[0] * dx * AB_SCALE);
int Y0_ = rint(M[3] * dx * AB_SCALE);
int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));
for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy, dst_index += dst_step)
for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy)
{
int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;
float X0 = fma(M[1], (CT)dy, X0_);
float Y0 = fma(M[4], (CT)dy, Y0_);
short sx = convert_short_sat(X0 >> AB_BITS);
short sy = convert_short_sat(Y0 >> AB_BITS);
int sx = convert_int_sat(rint(X0));
int sy = convert_int_sat(rint(Y0));
T v0 = scalar;
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
{
int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));
storepix(loadpix(srcptr + src_index), dstptr + dst_index);
v0 = loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset)));
}
else
storepix(scalar, dstptr + dst_index);
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
storepix(v0, dstptr + dst_index);
}
}
}
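The nearest path here drops the AB_SCALE/AB_BITS fixed-point arithmetic in favour of plain float coordinates rounded with rint (round half to even), which matches cvRound on the CPU side. A scalar sketch of the two strategies for one coordinate (M0..M2, AB_SCALE, AB_BITS stand in for the kernel's names; the kernel additionally saturates via convert_int_sat):

// removed fixed-point path
int X0_fixed = cvRound(M0*dx*AB_SCALE) + cvRound((M1*dy + M2)*AB_SCALE) + (AB_SCALE >> 1);
int sx_fixed = X0_fixed >> AB_BITS;
// new float path
float X0 = M0*dx + M1*dy + M2; // evaluated with fma() in the kernel
int sx = cvRound(X0);          // cvRound also rounds half to even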


@@ -92,22 +92,21 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
if (dx < dst_cols && dy < dst_rows)
{
CT X0 = M[0] * dx + M[1] * dy + M[2];
CT Y0 = M[3] * dx + M[4] * dy + M[5];
CT W = M[6] * dx + M[7] * dy + M[8];
W = W != 0.0f ? 1.f / W : 0.0f;
short sx = convert_short_sat_rte(X0*W);
short sy = convert_short_sat_rte(Y0*W);
float W = fma(M[6], (CT)dx, fma(M[7], (CT)dy, M[8]));
float X0 = fma(M[0], (CT)dx, fma(M[1], (CT)dy, M[2])) / W;
float Y0 = fma(M[3], (CT)dx, fma(M[4], (CT)dy, M[5])) / W;
int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);
int sx = convert_int_sat(rint(X0));
int sy = convert_int_sat(rint(Y0));
T v0 = scalar;
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
{
int src_index = mad24(sy, src_step, sx * pixsize + src_offset);
storepix(loadpix(srcptr + src_index), dstptr + dst_index);
v0 = loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset)));
}
else
storepix(scalar, dstptr + dst_index);
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
storepix(v0, dstptr + dst_index);
}
}


@@ -3,57 +3,61 @@
// of this distribution and at http://opencv.org/license.html.
// Shuffle
#define CV_WARP_NEAREST_SCALAR_SHUFFLE_DEF(cn, dtype_reg) \
dtype_reg p00##cn;
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(cn, dtype_reg) \
dtype_reg p00##cn, p01##cn, p10##cn, p11##cn;
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C1(dtype_reg, dtype_ptr) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
#define CV_WARP_SCALAR_SHUFFLE_DEF_C1(inter, dtype_reg, dtype_ptr) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
const dtype_ptr *srcptr = src + srcstep * iy + ix;
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C3(dtype_reg, dtype_ptr) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
#define CV_WARP_SCALAR_SHUFFLE_DEF_C3(inter, dtype_reg, dtype_ptr) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
const dtype_ptr *srcptr = src + srcstep * iy + ix*3;
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_C4(dtype_reg, dtype_ptr) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF(a, dtype_reg) \
#define CV_WARP_SCALAR_SHUFFLE_DEF_C4(inter, dtype_reg, dtype_ptr) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(r, dtype_reg) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(g, dtype_reg) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(b, dtype_reg) \
CV_WARP_##inter##_SCALAR_SHUFFLE_DEF(a, dtype_reg) \
const dtype_ptr *srcptr = src + srcstep * iy + ix*4;
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_8U(CN) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint8_t)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_16U(CN) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(int, uint16_t)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_32F(CN) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##CN(float, float)
#define CV_WARP_SCALAR_SHUFFLE_DEF_8U(INTER, CN) \
CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, int, uint8_t)
#define CV_WARP_SCALAR_SHUFFLE_DEF_16U(INTER, CN) \
CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, int, uint16_t)
#define CV_WARP_SCALAR_SHUFFLE_DEF_32F(INTER, CN) \
CV_WARP_SCALAR_SHUFFLE_DEF_##CN(INTER, float, float)
#define CV_WARP_NEAREST_SCALAR_SHUFFLE_LOAD(CN, cn, i) \
p00##CN = srcptr[i];
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(CN, cn, i) \
p00##CN = srcptr[i]; p01##CN = srcptr[i + cn]; \
p10##CN = srcptr[srcstep + i]; p11##CN = srcptr[srcstep + cn + i];
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C1() \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 1, 0)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C3() \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 3, 0) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 3, 1) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 3, 2)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_C4() \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(r, 4, 0) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(g, 4, 1) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(b, 4, 2) \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD(a, 4, 3)
#define CV_WARP_SCALAR_SHUFFLE_LOAD_C1(inter) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 1, 0)
#define CV_WARP_SCALAR_SHUFFLE_LOAD_C3(inter) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(r, 3, 0) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 3, 1) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(b, 3, 2)
#define CV_WARP_SCALAR_SHUFFLE_LOAD_C4(inter) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(r, 4, 0) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(g, 4, 1) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(b, 4, 2) \
CV_WARP_##inter##_SCALAR_SHUFFLE_LOAD(a, 4, 3)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C1() \
#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C1() \
dstptr[x] = bval[0];
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C3() \
#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C3() \
dstptr[x*3] = bval[0]; \
dstptr[x*3+1] = bval[1]; \
dstptr[x*3+2] = bval[2];
#define CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C4() \
#define CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_C4() \
dstptr[x*4] = bval[0]; \
dstptr[x*4+1] = bval[1]; \
dstptr[x*4+2] = bval[2]; \
dstptr[x*4+3] = bval[3];
#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C1(dy, dx, pxy) \
#define CV_WARP_SCALAR_FETCH_PIXEL_C1(dy, dx, pxy) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t ofs = dy*srcstep + dx; \
pxy##g = srcptr[ofs]; \
@@ -67,7 +71,7 @@
size_t glob_ofs = iy_*srcstep + ix_; \
pxy##g = src[glob_ofs]; \
}
#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C3(dy, dx, pxy) \
#define CV_WARP_SCALAR_FETCH_PIXEL_C3(dy, dx, pxy) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t ofs = dy*srcstep + dx*3; \
pxy##r = srcptr[ofs]; \
@@ -89,7 +93,7 @@
pxy##g = src[glob_ofs+1]; \
pxy##b = src[glob_ofs+2]; \
}
#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_C4(dy, dx, pxy) \
#define CV_WARP_SCALAR_FETCH_PIXEL_C4(dy, dx, pxy) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t ofs = dy*srcstep + dx*4; \
pxy##r = srcptr[ofs]; \
@@ -115,83 +119,96 @@
pxy##b = src[glob_ofs+2]; \
pxy##a = src[glob_ofs+3]; \
}
#define CV_WARP_NEAREST_SCALAR_FETCH_PIXEL(CN) \
CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 0, p00)
#define CV_WARP_LINEAR_SCALAR_FETCH_PIXEL(CN) \
CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 0, p00) \
CV_WARP_SCALAR_FETCH_PIXEL_##CN(0, 1, p01) \
CV_WARP_SCALAR_FETCH_PIXEL_##CN(1, 0, p10) \
CV_WARP_SCALAR_FETCH_PIXEL_##CN(1, 1, p11)
#define CV_WARP_LINEAR_SCALAR_SHUFFLE(CN, DEPTH) \
#define CV_WARP_SCALAR_NEAREST_COMPUTE_COORD() \
int ix = cvRound(sx), iy = cvRound(sy);
#define CV_WARP_SCALAR_LINEAR_COMPUTE_COORD() \
int ix = cvFloor(sx), iy = cvFloor(sy); \
sx -= ix; sy -= iy; \
CV_WARP_LINEAR_SCALAR_SHUFFLE_DEF_##DEPTH(CN); \
sx -= ix; sy -= iy;
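The two COMPUTE_COORD variants capture the core difference between the interpolation modes: nearest needs only a rounded integer coordinate, while linear keeps the floor plus fractional blending weights (sketch):

// NEAREST: snap to the closest source pixel.
int ix = cvRound(sx), iy = cvRound(sy);
// LINEAR: integer base plus fractional weights in [0, 1).
int jx = cvFloor(sx), jy = cvFloor(sy);
float ax = sx - jx, ay = sy - jy;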
#define CV_WARP_SCALAR_SHUFFLE(INTER, CN, DEPTH) \
CV_WARP_SCALAR_##INTER##_COMPUTE_COORD() \
CV_WARP_SCALAR_SHUFFLE_DEF_##DEPTH(INTER, CN) \
if ((((unsigned)ix < (unsigned)(srccols-1)) & \
((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
CV_WARP_LINEAR_SCALAR_SHUFFLE_LOAD_##CN() \
CV_WARP_SCALAR_SHUFFLE_LOAD_##CN(INTER) \
} else { \
if ((border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) && \
(((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
((unsigned)(iy+1) >= (unsigned)(srcrows+1))) != 0) { \
if (border_type == BORDER_CONSTANT) { \
CV_WARP_LINEAR_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_##CN() \
CV_WARP_SCALAR_SHUFFLE_STORE_CONSTANT_BORDER_##CN() \
} \
continue; \
} \
CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(0, 0, p00); \
CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(0, 1, p01); \
CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(1, 0, p10); \
CV_WARP_LINEAR_SCALAR_FETCH_PIXEL_##CN(1, 1, p11); \
CV_WARP_##INTER##_SCALAR_FETCH_PIXEL(CN) \
}
// Linear interpolation calculation
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(cn) \
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(cn) \
float v0##cn = p00##cn + sx*(p01##cn - p00##cn); \
float v1##cn = p10##cn + sx*(p11##cn - p10##cn);
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C1() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C3() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_C4() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(r) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(g) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(b) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32(a)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C1() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C3() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(r) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(b)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_C4() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(r) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(g) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(b) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32(a)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(cn) \
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(cn) \
v0##cn += sy*(v1##cn - v0##cn);
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C1() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C3() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_C4() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(r) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(g) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(b) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32(a)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C1() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C3() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(r) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(b)
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_C4() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(r) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(g) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(b) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32(a)
#define CV_WARP_LINEAR_SCALAR_INTER_CALC_F32(CN) \
CV_WARP_LINEAR_SCALAR_INTER_CALC_ALPHA_F32_##CN() \
CV_WARP_LINEAR_SCALAR_INTER_CALC_BETA_F32_##CN()
#define CV_WARP_SCALAR_LINEAR_INTER_CALC_F32(CN) \
CV_WARP_SCALAR_LINEAR_INTER_CALC_ALPHA_F32_##CN() \
CV_WARP_SCALAR_LINEAR_INTER_CALC_BETA_F32_##CN()
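Spelled out, the ALPHA/BETA macro pair computes the usual bilinear blend per channel, with sx and sy already reduced to fractional weights (sketch):

float v0 = p00 + sx*(p01 - p00); // blend along x, top row
float v1 = p10 + sx*(p11 - p10); // blend along x, bottom row
float v  = v0 + sy*(v1 - v0);    // blend along y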
// Store
#define CV_WARP_LINEAR_SCALAR_STORE_C1(dtype) \
dstptr[x] = saturate_cast<dtype>(v0g);
#define CV_WARP_LINEAR_SCALAR_STORE_C3(dtype) \
dstptr[x*3] = saturate_cast<dtype>(v0r); \
dstptr[x*3+1] = saturate_cast<dtype>(v0g); \
dstptr[x*3+2] = saturate_cast<dtype>(v0b);
#define CV_WARP_LINEAR_SCALAR_STORE_C4(dtype) \
dstptr[x*4] = saturate_cast<dtype>(v0r); \
dstptr[x*4+1] = saturate_cast<dtype>(v0g); \
dstptr[x*4+2] = saturate_cast<dtype>(v0b); \
dstptr[x*4+3] = saturate_cast<dtype>(v0a);
#define CV_WARP_LINEAR_SCALAR_STORE_8U(CN) \
CV_WARP_LINEAR_SCALAR_STORE_##CN(uint8_t)
#define CV_WARP_LINEAR_SCALAR_STORE_16U(CN) \
CV_WARP_LINEAR_SCALAR_STORE_##CN(uint16_t)
#define CV_WARP_LINEAR_SCALAR_STORE_32F(CN) \
CV_WARP_LINEAR_SCALAR_STORE_##CN(float)
#define CV_WARP_SCALAR_STORE_C1(dtype, var) \
dstptr[x] = saturate_cast<dtype>(var##g);
#define CV_WARP_SCALAR_STORE_C3(dtype, var) \
dstptr[x*3] = saturate_cast<dtype>(var##r); \
dstptr[x*3+1] = saturate_cast<dtype>(var##g); \
dstptr[x*3+2] = saturate_cast<dtype>(var##b);
#define CV_WARP_SCALAR_STORE_C4(dtype, var) \
dstptr[x*4] = saturate_cast<dtype>(var##r); \
dstptr[x*4+1] = saturate_cast<dtype>(var##g); \
dstptr[x*4+2] = saturate_cast<dtype>(var##b); \
dstptr[x*4+3] = saturate_cast<dtype>(var##a);
#define CV_WARP_SCALAR_STORE_8U(CN, var) \
CV_WARP_SCALAR_STORE_##CN(uint8_t, var)
#define CV_WARP_SCALAR_STORE_16U(CN, var) \
CV_WARP_SCALAR_STORE_##CN(uint16_t, var)
#define CV_WARP_SCALAR_STORE_32F(CN, var) \
CV_WARP_SCALAR_STORE_##CN(float, var)
#define CV_WARP_NEAREST_SCALAR_STORE(CN, DEPTH) \
CV_WARP_SCALAR_STORE_##DEPTH(CN, p00)
#define CV_WARP_LINEAR_SCALAR_STORE(CN, DEPTH) \
CV_WARP_LINEAR_SCALAR_STORE_##DEPTH(CN)
CV_WARP_SCALAR_STORE_##DEPTH(CN, v0)
#define CV_WARP_SCALAR_STORE(INTER, CN, DEPTH) \
CV_WARP_##INTER##_SCALAR_STORE(CN, DEPTH)
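Putting the scalar pieces together, CV_WARP_SCALAR_SHUFFLE(NEAREST, C1, 8U) followed by CV_WARP_NEAREST_SCALAR_STORE(C1, 8U) expands roughly to (hand-expanded sketch, border branches elided):

int ix = cvRound(sx), iy = cvRound(sy);
int p00g;
const uint8_t *srcptr = src + srcstep * iy + ix;
if ((((unsigned)ix < (unsigned)(srccols-1)) & ((unsigned)iy < (unsigned)(srcrows-1))) != 0) {
    p00g = srcptr[0];
} else {
    // constant/transparent border early-out, or per-pixel CV_WARP_NEAREST_SCALAR_FETCH_PIXEL
}
dstptr[x] = saturate_cast<uint8_t>(p00g);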


@@ -3,6 +3,26 @@
// of this distribution and at http://opencv.org/license.html.
// Shuffle (all pixels within image)
#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C1(dtype) \
for (int i = 0; i < uf; i++) { \
const dtype* srcptr = src + addr[i]; \
pixbuf[i] = srcptr[0];\
}
#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C3(dtype) \
for (int i = 0; i < uf; i++) { \
const dtype* srcptr = src + addr[i]; \
pixbuf[3*i] = srcptr[0];\
pixbuf[3*i + 1] = srcptr[1]; \
pixbuf[3*i + 2] = srcptr[2]; \
}
#define CV_WARP_NEAREST_VECTOR_SHUFFLE_ALLWITHIN_C4(dtype) \
for (int i = 0; i < uf; i++) { \
const dtype* srcptr = src + addr[i]; \
pixbuf[4*i] = srcptr[0];\
pixbuf[4*i + 1] = srcptr[1]; \
pixbuf[4*i + 2] = srcptr[2]; \
pixbuf[4*i + 3] = srcptr[3]; \
}
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_C1(dtype) \
for (int i = 0; i < uf; i++) { \
const dtype* srcptr = src + addr[i]; \
@@ -47,18 +67,17 @@
pixbuf[i + uf*11] = srcptr[srcstep + 6]; \
pixbuf[i + uf*15] = srcptr[srcstep + 7]; \
}
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_8U(CN) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint8_t)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_16U(CN) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint16_t)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_32F(CN) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##CN(float)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(CN, DEPTH) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_##DEPTH(CN)
#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_8U(INTER, CN) \
CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint8_t)
#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_16U(INTER, CN) \
CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(uint16_t)
#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_32F(INTER, CN) \
CV_WARP_##INTER##_VECTOR_SHUFFLE_ALLWITHIN_##CN(float)
#define CV_WARP_VECTOR_SHUFFLE_ALLWITHIN(INTER, CN, DEPTH) \
CV_WARP_VECTOR_SHUFFLE_ALLWITHIN_##DEPTH(INTER, CN)
// Shuffle (ARM NEON)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
uint8x8x4_t t00 = { \
vld1_u8(src + addr[0]), \
vld1_u8(src + addr[1]), \
@@ -84,7 +103,7 @@
vld1_u8(src + addr[7] + srcstep) \
}; \
uint32x2_t p00_, p01_, p10_, p11_;
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(coords, cn) \
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(coords, cn) \
p00_ = vreinterpret_u32_u8(vtbl4_u8(t00, coords)); \
p01_ = vreinterpret_u32_u8(vtbl4_u8(t01, coords)); \
p10_ = vreinterpret_u32_u8(vtbl4_u8(t10, coords)); \
@@ -93,58 +112,58 @@
p01##cn = vreinterpret_u8_u32(vtrn2_u32(p00_, p01_)); \
p10##cn = vreinterpret_u8_u32(vtrn1_u32(p10_, p11_)); \
p11##cn = vreinterpret_u8_u32(vtrn2_u32(p10_, p11_));
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C1() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(grays, g)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C3() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_C4() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(alphas, a)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8(CN) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN_NEON_U8_##CN()
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C1() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(grays, g)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C3() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_C4() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_LOAD() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(reds, r) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(greens, g) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(blues, b) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_TRN(alphas, a)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8(CN) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_ALLWITHIN_NEON_U8_##CN()
// Shuffle (not all pixels within image)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC1() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC1() \
v_store_low(dstptr + x, bval_v0);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC3() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC3() \
v_store_low(dstptr + x*3, bval_v0); \
v_store_low(dstptr + x*3 + uf, bval_v1); \
v_store_low(dstptr + x*3 + uf*2, bval_v2);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC4() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_8UC4() \
v_store_low(dstptr + x*4, bval_v0); \
v_store_low(dstptr + x*4 + uf, bval_v1); \
v_store_low(dstptr + x*4 + uf*2, bval_v2); \
v_store_low(dstptr + x*4 + uf*3, bval_v3);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC1() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC1() \
v_store(dstptr + x, bval_v0);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC3() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC3() \
v_store(dstptr + x*3, bval_v0); \
v_store(dstptr + x*3 + uf, bval_v1); \
v_store(dstptr + x*3 + uf*2, bval_v2);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC4() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_16UC4() \
v_store(dstptr + x*4, bval_v0); \
v_store(dstptr + x*4 + uf, bval_v1); \
v_store(dstptr + x*4 + uf*2, bval_v2); \
v_store(dstptr + x*4 + uf*3, bval_v3);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC1() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC1() \
v_store(dstptr + x, bval_v0_l); \
v_store(dstptr + x + vlanes_32, bval_v0_h);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC3() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC3() \
v_store(dstptr + x*3, bval_v0_l); \
v_store(dstptr + x*3 + vlanes_32, bval_v0_h); \
v_store(dstptr + x*3 + uf, bval_v1_l); \
v_store(dstptr + x*3 + uf + vlanes_32, bval_v1_h); \
v_store(dstptr + x*3 + uf*2, bval_v2_l); \
v_store(dstptr + x*3 + uf*2 + vlanes_32, bval_v2_h);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC4() \
#define CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_32FC4() \
v_store(dstptr + x*4, bval_v0_l); \
v_store(dstptr + x*4 + vlanes_32, bval_v0_h); \
v_store(dstptr + x*4 + uf, bval_v1_l); \
@@ -154,70 +173,83 @@
v_store(dstptr + x*4 + uf*3, bval_v3_l); \
v_store(dstptr + x*4 + uf*3 + vlanes_32, bval_v3_h);
#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C1(dy, dx, pixbuf_ofs) \
#define CV_WARP_VECTOR_FETCH_PIXEL_C1(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t addr_i = addr[i] + dy*srcstep + dx; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
} else if (border_type == BORDER_CONSTANT) { \
pixbuf[i + pixbuf_ofs] = bval[0]; \
pixbuf[i + pixbuf_ofs0] = bval[0]; \
} else if (border_type == BORDER_TRANSPARENT) { \
pixbuf[i + pixbuf_ofs] = dstptr[x + i]; \
pixbuf[i + pixbuf_ofs0] = dstptr[x + i]; \
} else { \
int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
size_t addr_i = iy_*srcstep + ix_; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
}
#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C3(dy, dx, pixbuf_ofs) \
#define CV_WARP_VECTOR_FETCH_PIXEL_C3(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t addr_i = addr[i] + dy*srcstep + dx*3; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \
} else if (border_type == BORDER_CONSTANT) { \
pixbuf[i + pixbuf_ofs] = bval[0]; \
pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \
pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \
pixbuf[i + pixbuf_ofs0] = bval[0]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = bval[1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = bval[2]; \
} else if (border_type == BORDER_TRANSPARENT) { \
pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*3]; \
pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*3 + 1]; \
pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*3 + 2]; \
pixbuf[i + pixbuf_ofs0] = dstptr[(x + i)*3]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = dstptr[(x + i)*3 + 1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = dstptr[(x + i)*3 + 2]; \
} else { \
int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
size_t addr_i = iy_*srcstep + ix_*3; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \
}
#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_C4(dy, dx, pixbuf_ofs) \
#define CV_WARP_VECTOR_FETCH_PIXEL_C4(dy, dx, pixbuf_ofs0, pixbuf_ofs1) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t addr_i = addr[i] + dy*srcstep + dx*4; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = src[addr_i+3]; \
} else if (border_type == BORDER_CONSTANT) { \
pixbuf[i + pixbuf_ofs] = bval[0]; \
pixbuf[i + pixbuf_ofs + uf*4] = bval[1]; \
pixbuf[i + pixbuf_ofs + uf*8] = bval[2]; \
pixbuf[i + pixbuf_ofs + uf*12] = bval[3]; \
pixbuf[i + pixbuf_ofs0] = bval[0]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = bval[1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = bval[2]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = bval[3]; \
} else if (border_type == BORDER_TRANSPARENT) { \
pixbuf[i + pixbuf_ofs] = dstptr[(x + i)*4]; \
pixbuf[i + pixbuf_ofs + uf*4] = dstptr[(x + i)*4 + 1]; \
pixbuf[i + pixbuf_ofs + uf*8] = dstptr[(x + i)*4 + 2]; \
pixbuf[i + pixbuf_ofs + uf*12] = dstptr[(x + i)*4 + 3]; \
pixbuf[i + pixbuf_ofs0] = dstptr[(x + i)*4]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = dstptr[(x + i)*4 + 1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = dstptr[(x + i)*4 + 2]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = dstptr[(x + i)*4 + 3]; \
} else { \
int ix_ = borderInterpolate_fast(ix + dx, srccols, border_type_x); \
int iy_ = borderInterpolate_fast(iy + dy, srcrows, border_type_y); \
size_t addr_i = iy_*srcstep + ix_*4; \
pixbuf[i + pixbuf_ofs] = src[addr_i]; \
pixbuf[i + pixbuf_ofs + uf*4] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs + uf*8] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs + uf*12] = src[addr_i+3]; \
pixbuf[i + pixbuf_ofs0] = src[addr_i]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1] = src[addr_i+1]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*2] = src[addr_i+2]; \
pixbuf[i + pixbuf_ofs0 + pixbuf_ofs1*3] = src[addr_i+3]; \
}
#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C1() \
CV_WARP_VECTOR_FETCH_PIXEL_C1(0, 0, 0, 1);
#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C3() \
CV_WARP_VECTOR_FETCH_PIXEL_C3(0, 0, 2*i, 1);
#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_C4() \
CV_WARP_VECTOR_FETCH_PIXEL_C4(0, 0, 3*i, 1);
#define CV_WARP_NEAREST_VECTOR_FETCH_PIXEL(CN) \
CV_WARP_NEAREST_VECTOR_FETCH_PIXEL_##CN()
#define CV_WARP_LINEAR_VECTOR_FETCH_PIXEL(CN) \
CV_WARP_VECTOR_FETCH_PIXEL_##CN(0, 0, 0, uf*4); \
CV_WARP_VECTOR_FETCH_PIXEL_##CN(0, 1, uf, uf*4); \
CV_WARP_VECTOR_FETCH_PIXEL_##CN(1, 0, uf*2, uf*4); \
CV_WARP_VECTOR_FETCH_PIXEL_##CN(1, 1, uf*3, uf*4);
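The fetch dispatch shows the footprint difference directly: per gathered pixel, NEAREST touches only the rounded source pixel, while LINEAR fetches the full 2x2 neighbourhood (sketch of the gather loop; fetch() is a hypothetical stand-in for the FETCH_PIXEL macros):

for (int i = 0; i < uf; i++) {
    int ix = src_ix[i], iy = src_iy[i];
    fetch(iy + 0, ix + 0); // p00 -- NEAREST stops here
    fetch(iy + 0, ix + 1); // p01
    fetch(iy + 1, ix + 0); // p10  LINEAR also gathers these three
    fetch(iy + 1, ix + 1); // p11
}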
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(CN, DEPTH) \
#define CV_WARP_VECTOR_SHUFFLE_NOTALLWITHIN(INTER, CN, DEPTH) \
if (border_type == BORDER_CONSTANT || border_type == BORDER_TRANSPARENT) { \
mask_0 = v_lt(v_reinterpret_as_u32(v_add(src_ix0, one)), outer_scols); \
mask_1 = v_lt(v_reinterpret_as_u32(v_add(src_ix1, one)), outer_scols); \
@@ -226,7 +258,7 @@
v_uint16 outer_mask = v_pack(mask_0, mask_1); \
if (v_reduce_max(outer_mask) == 0) { \
if (border_type == BORDER_CONSTANT) { \
CV_WARP_LINEAR_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_##DEPTH##CN() \
CV_WARP_VECTOR_SHUFFLE_STORE_CONSTANT_BORDER_##DEPTH##CN() \
} \
continue; \
} \
@@ -237,111 +269,135 @@
vx_store(src_iy + vlanes_32, src_iy1); \
for (int i = 0; i < uf; i++) { \
int ix = src_ix[i], iy = src_iy[i]; \
CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 0, 0); \
CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(0, 1, uf); \
CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 0, uf*2); \
CV_WARP_LINEAR_VECTOR_FETCH_PIXEL_##CN(1, 1, uf*3); \
CV_WARP_##INTER##_VECTOR_FETCH_PIXEL(CN) \
}
// Shuffle (not all pixels within image) (ARM NEON)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(cn, offset)\
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(cn, offset)\
p00##cn = vld1_u8(pixbuf + offset); \
p01##cn = vld1_u8(pixbuf + offset + 8); \
p10##cn = vld1_u8(pixbuf + offset + 16); \
p11##cn = vld1_u8(pixbuf + offset + 24);
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C1() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 0)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C3() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_C4() \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(a, 96)
#define CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8(CN) \
CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN_NEON_U8_##CN()
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C1() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 0)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C3() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_C4() \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(r, 0) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(g, 32) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(b, 64) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_LOAD(a, 96)
#define CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8(CN) \
CV_WARP_VECTOR_LINEAR_SHUFFLE_NOTALLWITHIN_NEON_U8_##CN()
// Load pixels for linear interpolation (uint8_t -> int16_t)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(cn, i) \
v_int16 f00##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * i)), \
f01##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+1))), \
f10##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+2))), \
f11##cn = v_reinterpret_as_s16(vx_load_expand(pixbuf + uf * (i+3)));
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C1() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 0)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C3() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_C4() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(b, 8) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16(a, 12)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16(CN) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_##CN();
// [New] Load pixels for interpolation
#define CV_WARP_VECTOR_NEAREST_LOAD_CN_8U_16U(cn, i) \
v_uint16 f00##cn = vx_load_expand(pixbuf + uf * i);
#define CV_WARP_VECTOR_NEAREST_LOAD_CN_16U_16U(cn, i) \
v_uint16 f00##cn = vx_load(pixbuf + uf * i);
#define CV_WARP_VECTOR_NEAREST_LOAD_CN_32F_32F(cn, i) \
v_float32 f00##cn##l = vx_load(pixbuf + uf * i); \
v_float32 f00##cn##h = vx_load(pixbuf + uf * i + vlanes_32);
#define CV_WARP_VECTOR_LINEAR_LOAD_CN_8U_16U(cn, i) \
v_uint16 f00##cn = vx_load_expand(pixbuf + uf * 4*i), \
f01##cn = vx_load_expand(pixbuf + uf * (4*i+1)), \
f10##cn = vx_load_expand(pixbuf + uf * (4*i+2)), \
f11##cn = vx_load_expand(pixbuf + uf * (4*i+3));
#define CV_WARP_VECTOR_LINEAR_LOAD_CN_16U_16U(cn, i) \
v_uint16 f00##cn = vx_load(pixbuf + uf * 4*i), \
f01##cn = vx_load(pixbuf + uf * (4*i+1)), \
f10##cn = vx_load(pixbuf + uf * (4*i+2)), \
f11##cn = vx_load(pixbuf + uf * (4*i+3));
#define CV_WARP_VECTOR_LINEAR_LOAD_CN_32F_32F(cn, i) \
v_float32 f00##cn##l = vx_load(pixbuf + uf * 4*i), \
f00##cn##h = vx_load(pixbuf + uf * 4*i + vlanes_32); \
v_float32 f01##cn##l = vx_load(pixbuf + uf * (4*i+1)), \
f01##cn##h = vx_load(pixbuf + uf * (4*i+1) + vlanes_32); \
v_float32 f10##cn##l = vx_load(pixbuf + uf * (4*i+2)), \
f10##cn##h = vx_load(pixbuf + uf * (4*i+2) + vlanes_32); \
v_float32 f11##cn##l = vx_load(pixbuf + uf * (4*i+3)), \
f11##cn##h = vx_load(pixbuf + uf * (4*i+3) + vlanes_32);
#define CV_WARP_VECTOR_INTER_LOAD_C1(INTER, SDEPTH, DDEPTH) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 0)
#define CV_WARP_VECTOR_INTER_LOAD_C3(INTER, SDEPTH, DDEPTH) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(r, 0) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 1) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(b, 2)
#define CV_WARP_VECTOR_INTER_LOAD_C4(INTER, SDEPTH, DDEPTH) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(r, 0) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(g, 1) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(b, 2) \
CV_WARP_VECTOR_##INTER##_LOAD_CN_##SDEPTH##_##DDEPTH(a, 3)
#define CV_WARP_VECTOR_INTER_LOAD(INTER, CN, SDEPTH, DDEPTH) \
CV_WARP_VECTOR_INTER_LOAD_##CN(INTER, SDEPTH, DDEPTH)
// Load pixels for linear interpolation (uint8_t -> int16_t) (ARM NEON)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(cn) \
v_int16 f00##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p00##cn))), \
f01##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p01##cn))), \
f10##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p10##cn))), \
f11##cn = v_reinterpret_as_s16(v_uint16(vmovl_u8(p11##cn)));
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C1() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C3() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(r) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(b)
// [New] Store
#define CV_WARP_VECTOR_NEAREST_STORE_C1_16U_8U() \
v_pack_store(dstptr + x, f00g);
#define CV_WARP_VECTOR_NEAREST_STORE_C3_16U_8U() \
v_pack_store(dstptr + 3*x, f00r); \
v_pack_store(dstptr + 3*x + uf, f00g); \
v_pack_store(dstptr + 3*x + uf*2, f00b);
#define CV_WARP_VECTOR_NEAREST_STORE_C4_16U_8U() \
v_pack_store(dstptr + 4*x, f00r); \
v_pack_store(dstptr + 4*x + uf, f00g); \
v_pack_store(dstptr + 4*x + uf*2, f00b); \
v_pack_store(dstptr + 4*x + uf*3, f00a);
#define CV_WARP_VECTOR_NEAREST_STORE_C1_16U_16U() \
vx_store(dstptr + x, f00g);
#define CV_WARP_VECTOR_NEAREST_STORE_C3_16U_16U() \
vx_store(dstptr + 3*x, f00r); \
vx_store(dstptr + 3*x + uf, f00g); \
vx_store(dstptr + 3*x + uf*2, f00b);
#define CV_WARP_VECTOR_NEAREST_STORE_C4_16U_16U() \
vx_store(dstptr + 4*x, f00r); \
vx_store(dstptr + 4*x + uf, f00g); \
vx_store(dstptr + 4*x + uf*2, f00b); \
vx_store(dstptr + 4*x + uf*3, f00a);
#define CV_WARP_VECTOR_NEAREST_STORE_C1_32F_32F() \
vx_store(dstptr + x, f00gl); \
vx_store(dstptr + x + vlanes_32, f00gh);
#define CV_WARP_VECTOR_NEAREST_STORE_C3_32F_32F() \
vx_store(dstptr + 3*x, f00rl); \
vx_store(dstptr + 3*x + vlanes_32, f00rh); \
vx_store(dstptr + 3*x + uf, f00gl); \
vx_store(dstptr + 3*x + uf + vlanes_32, f00gh); \
vx_store(dstptr + 3*x + uf*2, f00bl); \
vx_store(dstptr + 3*x + uf*2 + vlanes_32, f00bh);
#define CV_WARP_VECTOR_NEAREST_STORE_C4_32F_32F() \
vx_store(dstptr + 4*x, f00rl); \
vx_store(dstptr + 4*x + vlanes_32, f00rh); \
vx_store(dstptr + 4*x + uf, f00gl); \
vx_store(dstptr + 4*x + uf + vlanes_32, f00gh); \
vx_store(dstptr + 4*x + uf*2, f00bl); \
vx_store(dstptr + 4*x + uf*2 + vlanes_32, f00bh); \
vx_store(dstptr + 4*x + uf*3, f00al); \
vx_store(dstptr + 4*x + uf*3 + vlanes_32, f00ah);
#define CV_WARP_VECTOR_INTER_STORE(INTER, CN, SDEPTH, DDEPTH) \
CV_WARP_VECTOR_##INTER##_STORE_##CN##_##SDEPTH##_##DDEPTH()
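// For reference: CV_WARP_VECTOR_INTER_STORE(NEAREST, C3, 16U, 8U) resolves to
// CV_WARP_VECTOR_NEAREST_STORE_C3_16U_8U() above, i.e. one saturating
// v_pack_store (u16 -> u8) per channel plane, at dstptr + 3*x, + uf and + uf*2.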
// Load pixels for linear interpolation (uint8_t -> uint16_t) (ARM NEON)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(cn) \
v_uint16 f00##cn = v_uint16(vmovl_u8(p00##cn)), \
f01##cn = v_uint16(vmovl_u8(p01##cn)), \
f10##cn = v_uint16(vmovl_u8(p10##cn)), \
f11##cn = v_uint16(vmovl_u8(p11##cn));
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_C1() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_C3() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(r) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(b)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_C4() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(r) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(g) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(b) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8S16_NEON(a)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON(CN) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8S16_NEON_##CN();
// Load pixels for linear interpolation (uint16_t -> uint16_t)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(cn, i) \
v_uint16 f00##cn = vx_load(pixbuf + uf * i), \
f01##cn = vx_load(pixbuf + uf * (i+1)), \
f10##cn = vx_load(pixbuf + uf * (i+2)), \
f11##cn = vx_load(pixbuf + uf * (i+3));
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C1() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 0)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C3() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_C4() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(b, 8) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U16(a, 12)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(CN) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16_##CN();
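// The offsets above imply the pixbuf layout produced by the preceding gather
// step: uf-element blocks ordered [f00|f01|f10|f11] within each channel, with
// channels laid out channel-major (for C4: r at blocks 0..3, g at 4..7,
// b at 8..11, a at 12..15).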
// Convert pixels for linear interpolation (int16_t -> float)
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(cn) \
v_float32 f00##cn##l = v_cvt_f32(v_expand_low(f00##cn)), f00##cn##h = v_cvt_f32(v_expand_high(f00##cn)), \
f01##cn##l = v_cvt_f32(v_expand_low(f01##cn)), f01##cn##h = v_cvt_f32(v_expand_high(f01##cn)), \
f10##cn##l = v_cvt_f32(v_expand_low(f10##cn)), f10##cn##h = v_cvt_f32(v_expand_high(f10##cn)), \
f11##cn##l = v_cvt_f32(v_expand_low(f11##cn)), f11##cn##h = v_cvt_f32(v_expand_high(f11##cn));
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C1() \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g)
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C3() \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(r) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(b)
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_C4() \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(r) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(g) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(b) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_S16F32(a)
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32(CN) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_S16F32_##CN()
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_C4() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(r) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(g) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(b) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8U16_NEON(a)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON(CN) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_U8U16_NEON_##CN();
// Convert pixels for linear interpolation (uint16_t -> float)
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_CN_U16F32(cn) \
@ -363,26 +419,6 @@
#define CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(CN) \
CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32_##CN()
// Load pixels for linear interpolation (float -> float)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(cn, i) \
v_float32 f00##cn##l = vx_load(pixbuf + uf * i), f00##cn##h = vx_load(pixbuf + uf * i + vlanes_32), \
f01##cn##l = vx_load(pixbuf + uf * (i+1)), f01##cn##h = vx_load(pixbuf + uf * (i+1) + vlanes_32), \
f10##cn##l = vx_load(pixbuf + uf * (i+2)), f10##cn##h = vx_load(pixbuf + uf * (i+2) + vlanes_32), \
f11##cn##l = vx_load(pixbuf + uf * (i+3)), f11##cn##h = vx_load(pixbuf + uf * (i+3) + vlanes_32);
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C1() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 0)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C3() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_C4() \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(r, 0) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(g, 4) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(b, 8) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_F32(a, 12)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(CN) \
CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32_##CN()
// Load pixels for linear interpolation (uint8_t -> float16)
#define CV_WARP_LINEAR_VECTOR_INTER_LOAD_CN_U8F16(cn) \
v_float16 f00##cn = v_float16(vcvtq_f16_u16(vmovl_u8(p00##cn))), \
@ -556,9 +592,30 @@
#define CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8(CN) \
CV_WARP_LINEAR_VECTOR_INTER_STORE_F16U8_##CN()
// Special case for C4 load, shuffle and bilinear interpolation
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
// Special case for C4 shuffle, interpolation and store
// SIMD128, nearest
#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_8UC4_I(ofs) \
const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
v_uint32 i##ofs##_pix0 = vx_load_expand_q(srcptr##ofs);
#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_uint32 i##ofs##_pix0 = vx_load_expand(srcptr##ofs);
#define CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs);
#define CV_WARP_SIMD128_NEAREST_STORE_8UC4_I() \
v_pack_store(dstptr + 4*(x+i), v_pack(i0_pix0, i1_pix0)); \
v_pack_store(dstptr + 4*(x+i+2), v_pack(i2_pix0, i3_pix0));
#define CV_WARP_SIMD128_NEAREST_STORE_16UC4_I() \
vx_store(dstptr + 4*(x+i), v_pack(i0_pix0, i1_pix0)); \
vx_store(dstptr + 4*(x+i+2), v_pack(i2_pix0, i3_pix0));
#define CV_WARP_SIMD128_NEAREST_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i0_pix0); \
vx_store(dstptr + 4*(x+i+1), i1_pix0); \
vx_store(dstptr + 4*(x+i+2), i2_pix0); \
vx_store(dstptr + 4*(x+i+3), i3_pix0);
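// For 8UC4 the nearest path narrows twice: v_pack (u32 -> u16) joins two
// gathered pixels per register, then v_pack_store (u16 -> u8) writes them out,
// so the two stores above cover four pixels; the 32F variant needs no
// narrowing and stores one pixel per vx_store.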
// SIMD128, bilinear
#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_8UC4_I(ofs) \
const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs))); \
v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(srcptr##ofs+4))); \
@ -569,7 +626,7 @@
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs))); \
v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+4))); \
@ -580,7 +637,7 @@
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
#define CV_WARP_SIMD128_LINEAR_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); \
v_float32 i##ofs##_pix1 = vx_load(srcptr##ofs+4); \
@ -591,30 +648,59 @@
i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
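// The three v_fma steps above are the usual two-level lerp: with alpha/beta
// the fractional x/y offsets (valpha/vbeta),
//     top    = p00 + alpha * (p01 - p00)
//     bottom = p10 + alpha * (p11 - p10)
//     result = top + beta * (bottom - top)
// which is algebraically (1-alpha)*(1-beta)*p00 + alpha*(1-beta)*p01
// + (1-alpha)*beta*p10 + alpha*beta*p11.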
#define CV_WARP_SIMD128_STORE_8UC4_I() \
#define CV_WARP_SIMD128_LINEAR_STORE_8UC4_I() \
v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
v_pack_store(dstptr + 4*(x+i), i01_pix); \
v_pack_store(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMD128_STORE_16UC4_I() \
#define CV_WARP_SIMD128_LINEAR_STORE_16UC4_I() \
v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
vx_store(dstptr + 4*(x+i), i01_pix); \
vx_store(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMD128_STORE_32FC4_I() \
#define CV_WARP_SIMD128_LINEAR_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i0_pix0); \
vx_store(dstptr + 4*(x+i)+4, i1_pix0); \
vx_store(dstptr + 4*(x+i)+8, i2_pix0); \
vx_store(dstptr + 4*(x+i)+12, i3_pix0);
#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(DEPTH) \
vx_store(dstptr + 4*(x+i+1), i1_pix0); \
vx_store(dstptr + 4*(x+i+2), i2_pix0); \
vx_store(dstptr + 4*(x+i+3), i3_pix0);
#define CV_WARP_SIMD128_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \
for (int i = 0; i < uf; i+=vlanes_32) { \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
CV_WARP_SIMD128_STORE_##DEPTH##C4_I(); \
CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0); \
CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(1); \
CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2); \
CV_WARP_SIMD128_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(3); \
CV_WARP_SIMD128_##INTER##_STORE_##DEPTH##C4_I(); \
}
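// For reference, one iteration of CV_WARP_SIMD128_SHUFFLE_INTER_STORE_C4(NEAREST, 8U)
// expands to four single-pixel gathers
//     CV_WARP_SIMD128_NEAREST_SHUFFLE_INTER_8UC4_I(0) ... (3)
// followed by CV_WARP_SIMD128_NEAREST_STORE_8UC4_I(), i.e. four 8UC4 pixels are
// fetched by address and written back per vlanes_32 step.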
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
// SIMD256, nearest
#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
const uint8_t *srcptr##ofs1 = src + addr[i+ofs1]; \
v_uint32 i##ofs0##_pix0x = v256_load_expand_q(srcptr##ofs0); \
v_uint32 i##ofs1##_pix0x = v256_load_expand_q(srcptr##ofs1); \
v_uint32 i##ofs0##ofs1##_pix00 = v_combine_low(i##ofs0##_pix0x, i##ofs1##_pix0x);
#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \
const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \
v_uint32 i##ofs0##_pix0x = v256_load_expand(srcptr##ofs0); \
v_uint32 i##ofs1##_pix0x = v256_load_expand(srcptr##ofs1); \
v_uint32 i##ofs0##ofs1##_pix00 = v_combine_low(i##ofs0##_pix0x, i##ofs1##_pix0x);
#define CV_WARP_SIMD256_NEAREST_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
const float *srcptr##ofs0 = src + addr[i+ofs0]; \
const float *srcptr##ofs1 = src + addr[i+ofs1]; \
v_float32 i##ofs0##ofs1##_fpix00 = vx_load_halves(srcptr##ofs0, srcptr##ofs1);
#define CV_WARP_SIMD256_NEAREST_STORE_8UC4_I() \
v_pack_store(dstptr + 4*(x+i), v_pack(i01_pix00, i23_pix00)); \
v_pack_store(dstptr + 4*(x+i+4), v_pack(i45_pix00, i67_pix00));
#define CV_WARP_SIMD256_NEAREST_STORE_16UC4_I() \
vx_store(dstptr + 4*(x+i), v_pack(i01_pix00, i23_pix00)); \
vx_store(dstptr + 4*(x+i+4), v_pack(i45_pix00, i67_pix00));
#define CV_WARP_SIMD256_NEAREST_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i01_fpix00); \
vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \
vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \
vx_store(dstptr + 4*(x+i)+24, i67_fpix00);
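// With 256-bit registers a v_uint32 holds 8 lanes, so the shuffles above gather
// two C4 pixels per register (v_combine_low joins the 4-lane halves for ofs0
// and ofs1), and each packed store then writes four pixels at once.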
// SIMD256, bilinear
#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
const uint8_t *srcptr##ofs1 = src + addr[i+ofs1]; \
v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand_q(srcptr##ofs0)), \
@ -635,8 +721,9 @@
i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
auto i##ofs0##ofs1##_pix00 = v_round(i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \
const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \
v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0)), \
@ -657,8 +744,9 @@
i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
auto i##ofs0##ofs1##_pix00 = v_round(i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_LINEAR_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
const float *srcptr##ofs0 = src + addr[i+ofs0]; \
const float *srcptr##ofs1 = src + addr[i+ofs1]; \
v_float32 i##ofs0##_fpix01 = v256_load(srcptr##ofs0), \
@ -678,30 +766,48 @@
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
#define CV_WARP_SIMD256_STORE_8UC4_I() \
auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
#define CV_WARP_SIMD256_STORE_16UC4_I() \
auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
#define CV_WARP_SIMD256_STORE_32FC4_I() \
#define CV_WARP_SIMD256_LINEAR_STORE_8UC4_I() \
v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix00, i23_pix00)); \
v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix00, i67_pix00));
#define CV_WARP_SIMD256_LINEAR_STORE_16UC4_I() \
vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix00, i23_pix00)); \
vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix00, i67_pix00));
#define CV_WARP_SIMD256_LINEAR_STORE_32FC4_I() \
vx_store(dstptr + 4*(x+i), i01_fpix00); \
vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \
vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \
vx_store(dstptr + 4*(x+i)+24, i67_fpix00);
#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(DEPTH) \
#define CV_WARP_SIMD256_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \
for (int i = 0; i < uf; i+=vlanes_32) { \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0, 1); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2, 3); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(4, 5); \
CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(6, 7); \
CV_WARP_SIMD256_STORE_##DEPTH##C4_I(); \
CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0, 1) \
CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2, 3) \
CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(4, 5) \
CV_WARP_SIMD256_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(6, 7) \
CV_WARP_SIMD256_##INTER##_STORE_##DEPTH##C4_I() \
}
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
// SIMD_SCALABLE (SIMDX), nearest
#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_8UC4_I(ofs) \
const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
v_uint32 i##ofs##_pix0 = v_load_expand_q<4>(srcptr##ofs);
#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_uint32 i##ofs##_pix0 = v_load_expand<4>(srcptr##ofs);
#define CV_WARP_SIMDX_NEAREST_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs);
#define CV_WARP_SIMDX_NEAREST_STORE_8UC4_I() \
v_pack_store<8>(dstptr + 4*(x+i), v_pack<4>(i0_pix0, i1_pix0)); \
v_pack_store<8>(dstptr + 4*(x+i+2), v_pack<4>(i2_pix0, i3_pix0));
#define CV_WARP_SIMDX_NEAREST_STORE_16UC4_I() \
v_store<8>(dstptr + 4*(x+i), v_pack<4>(i0_pix0, i1_pix0)); \
v_store<8>(dstptr + 4*(x+i+2), v_pack<4>(i2_pix0, i3_pix0));
#define CV_WARP_SIMDX_NEAREST_STORE_32FC4_I() \
v_store<4>(dstptr + 4*(x+i), i0_fpix0); \
v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \
v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \
v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0);
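// On the scalable backend the vector length is unknown at compile time, so the
// explicit lane-count templates above (v_load<4>, v_load_expand<4>, v_pack<4>,
// v_store<8>, ...) presumably pin each operation to a fixed 4- or 8-lane prefix
// of the register, mirroring the fixed-width SIMD128 path.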
// SIMD_SCALABLE (SIMDX), bilinear
#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_8UC4_I(ofs) \
const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs))), \
i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q<4>(srcptr##ofs+4))), \
@ -711,8 +817,9 @@
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); \
auto i##ofs##_pix0 = v_round(i##ofs##_fpix0);
#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_16UC4_I(ofs) \
const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs))), \
i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+4))), \
@ -722,8 +829,9 @@
i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0); \
auto i##ofs##_pix0 = v_round(i##ofs##_fpix0);
#define CV_WARP_SIMDX_LINEAR_SHUFFLE_INTER_32FC4_I(ofs) \
const float *srcptr##ofs = src + addr[i+ofs]; \
v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs), \
i##ofs##_fpix1 = v_load<4>(srcptr##ofs+4), \
@ -734,26 +842,25 @@
i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
#define CV_WARP_SIMDX_STORE_8UC4_I() \
auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
v_pack_store<8>(dstptr + 4*(x+i), i01_pix); \
v_pack_store<8>(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMDX_STORE_16UC4_I() \
auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
v_store<8>(dstptr + 4*(x+i), i01_pix); \
v_store<8>(dstptr + 4*(x+i+2), i23_pix);
#define CV_WARP_SIMDX_STORE_32FC4_I() \
#define CV_WARP_SIMDX_LINEAR_STORE_8UC4_I() \
v_pack_store<8>(dstptr + 4*(x+i), v_pack_u<4>(i0_pix0, i1_pix0)); \
v_pack_store<8>(dstptr + 4*(x+i+2), v_pack_u<4>(i2_pix0, i3_pix0));
#define CV_WARP_SIMDX_LINEAR_STORE_16UC4_I() \
v_store<8>(dstptr + 4*(x+i), v_pack_u<4>(i0_pix0, i1_pix0)); \
v_store<8>(dstptr + 4*(x+i+2), v_pack_u<4>(i2_pix0, i3_pix0));
#define CV_WARP_SIMDX_LINEAR_STORE_32FC4_I() \
v_store<4>(dstptr + 4*(x+i), i0_fpix0); \
v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \
v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \
v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0);
#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(DEPTH) \
#define CV_WARP_SIMDX_SHUFFLE_INTER_STORE_C4(INTER, DEPTH) \
for (int i = 0; i < uf; i+=4) { \
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
CV_WARP_SIMDX_STORE_##DEPTH##C4_I(); \
CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(0); \
CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(1); \
CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(2); \
CV_WARP_SIMDX_##INTER##_SHUFFLE_INTER_##DEPTH##C4_I(3); \
CV_WARP_SIMDX_##INTER##_STORE_##DEPTH##C4_I(); \
}
#define CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD, INTER, DEPTH) \
CV_WARP_##SIMD##_SHUFFLE_INTER_STORE_C4(INTER, DEPTH)
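// Usage sketch: a C4 kernel specialised for 256-bit SIMD and nearest-neighbour
// 16U data would call CV_WARP_VECTOR_SHUFFLE_INTER_STORE_C4(SIMD256, NEAREST, 16U),
// which forwards to CV_WARP_SIMD256_SHUFFLE_INTER_STORE_C4(NEAREST, 16U) above.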

File diff suppressed because it is too large


@ -703,6 +703,16 @@ protected:
virtual void run_func();
virtual void run_reference_func();
template<typename T>
void new_nearest_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y);
template<typename T>
void new_nearest_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y);
template<typename T>
void new_nearest_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y);
template<typename T>
void new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y);
@ -720,7 +730,7 @@ protected:
remap_func funcs[2];
private:
template <typename T> void new_remap(const Mat&, Mat&);
template <typename T> void new_remap(const Mat&, Mat&, int);
void remap_nearest(const Mat&, Mat&);
void remap_generic(const Mat&, Mat&);
@ -879,19 +889,19 @@ void CV_Remap_Test::run_reference_func()
if (interpolation == INTER_AREA)
interpolation = INTER_LINEAR;
if (interpolation == INTER_LINEAR && mapx.depth() == CV_32F) {
if ((interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) && mapx.depth() == CV_32F) {
int src_depth = src.depth(), src_channels = src.channels();
Mat tmp = Mat::zeros(dst.size(), dst.type());
if (src_depth == CV_8U && (src_channels == 1 || src_channels == 3 || src_channels == 4)) {
new_remap<uint8_t>(src, tmp);
new_remap<uint8_t>(src, tmp, interpolation);
tmp.convertTo(reference_dst, reference_dst.depth());
return;
} else if (src_depth == CV_16U && (src_channels == 1 || src_channels == 3 || src_channels == 4)) {
new_remap<uint16_t>(src, tmp);
new_remap<uint16_t>(src, tmp, interpolation);
tmp.convertTo(reference_dst, reference_dst.depth());
return;
} else if (src_depth == CV_32F && (src_channels == 1 || src_channels == 3 || src_channels == 4)) {
new_remap<float>(src, tmp);
new_remap<float>(src, tmp, interpolation);
tmp.convertTo(reference_dst, reference_dst.depth());
return;
}
@ -903,7 +913,7 @@ void CV_Remap_Test::run_reference_func()
(this->*funcs[index])(src, reference_dst);
}
#define FETCH_PIXEL_SCALAR(cn, dy, dx) \
#define WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, dy, dx) \
if ((((unsigned)(ix + dx) < (unsigned)srccols) & ((unsigned)(iy + dy) < (unsigned)srcrows)) != 0) { \
size_t ofs = dy*srcstep + dx*cn; \
for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr[ofs+ci];} \
@ -917,16 +927,28 @@ void CV_Remap_Test::run_reference_func()
size_t glob_ofs = iy_*srcstep + ix_*cn; \
for (int ci = 0; ci < cn; ci++) { pxy[2*dy*cn+dx*cn+ci] = srcptr_[glob_ofs+ci];} \
}
#define WARPAFFINE_SHUFFLE(cn) \
if ((((unsigned)ix < (unsigned)(srccols-1)) & \
((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
#define WARP_NEAREST_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \
for (int ci = 0; ci < cn; ci++) { \
pxy[ci] = srcptr[ci]; \
}
#define WARP_LINEAR_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \
for (int ci = 0; ci < cn; ci++) { \
pxy[ci] = srcptr[ci]; \
pxy[ci+cn] = srcptr[ci+cn]; \
pxy[ci+cn*2] = srcptr[srcstep+ci]; \
pxy[ci+cn*3] = srcptr[srcstep+ci+cn]; \
} \
}
#define WARP_NEAREST_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \
WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 0);
#define WARP_LINEAR_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \
WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 0); \
WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 0, 1); \
WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 1, 0); \
WARP_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn, 1, 1);
#define WARP_SHUFFLE(inter, cn) \
if ((((unsigned)ix < (unsigned)(srccols-1)) & \
((unsigned)iy < (unsigned)(srcrows-1))) != 0) { \
WARP_##inter##_SHUFFLE_FETCH_PIXEL_IN_RANGE(cn) \
} else { \
if ((borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) && \
(((unsigned)(ix+1) >= (unsigned)(srccols+1))| \
@ -936,14 +958,50 @@ void CV_Remap_Test::run_reference_func()
} \
return; \
} \
FETCH_PIXEL_SCALAR(cn, 0, 0); \
FETCH_PIXEL_SCALAR(cn, 0, 1); \
FETCH_PIXEL_SCALAR(cn, 1, 0); \
FETCH_PIXEL_SCALAR(cn, 1, 1); \
WARP_##inter##_SHUFFLE_FETCH_PIXEL_OUT_RANGE(cn) \
}
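// For reference: WARP_SHUFFLE(NEAREST, 3) copies one 3-channel pixel into pxy via
// WARP_NEAREST_SHUFFLE_FETCH_PIXEL_IN_RANGE(3) when (ix, iy) lies safely inside
// the image, and otherwise takes the border-aware path, which for nearest reduces
// to a single bounds-checked fetch at offset (0, 0).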
template<typename T>
static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy)
void CV_Remap_Test::new_nearest_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y) {
int ix = (int)roundf(sx), iy = (int)roundf(sy);
T pxy[1];
const T *srcptr = srcptr_ + srcstep*iy + ix;
WARP_SHUFFLE(NEAREST, 1);
dstptr[x+0] = saturate_cast<T>(pxy[0]);
}
template<typename T>
void CV_Remap_Test::new_nearest_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y) {
int ix = (int)roundf(sx), iy = (int)roundf(sy);
T pxy[3];
const T *srcptr = srcptr_ + srcstep*iy + ix*3;
WARP_SHUFFLE(NEAREST, 3);
dstptr[x*3+0] = saturate_cast<T>(pxy[0]);
dstptr[x*3+1] = saturate_cast<T>(pxy[1]);
dstptr[x*3+2] = saturate_cast<T>(pxy[2]);
}
template<typename T>
void CV_Remap_Test::new_nearest_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr, int srccols, int srcrows, size_t srcstep,
const T *bval, int borderType_x, int borderType_y) {
int ix = (int)roundf(sx), iy = (int)roundf(sy);
T pxy[4];
const T *srcptr = srcptr_ + srcstep*iy + ix*4;
WARP_SHUFFLE(NEAREST, 4);
dstptr[x*4+0] = saturate_cast<T>(pxy[0]);
dstptr[x*4+1] = saturate_cast<T>(pxy[1]);
dstptr[x*4+2] = saturate_cast<T>(pxy[2]);
dstptr[x*4+3] = saturate_cast<T>(pxy[3]);
}
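// The c1/c3/c4 reference paths above differ only in the channel count baked into
// the source offset (ix*cn), the pxy size and the per-channel stores; keeping
// them separate mirrors the per-channel-count dispatch in the SIMD kernels.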
template<typename T>
static inline void warp_linear_calc(int cn, const T *pxy, T *dst, float sx, float sy)
{
for (int ci = 0; ci < cn; ci++) {
float p00 = pxy[ci];
@ -956,7 +1014,6 @@ static inline void warpaffine_linear_calc(int cn, const T *pxy, T *dst, float sx
dst[ci] = saturate_cast<T>(v0);
}
}
template<typename T>
void CV_Remap_Test::new_linear_c1(int x, float sx, float sy, const T *srcptr_, T *dstptr,
int srccols, int srcrows, size_t srcstep,
@ -968,11 +1025,10 @@ void CV_Remap_Test::new_linear_c1(int x, float sx, float sy, const T *srcptr_, T
T pxy[4];
const T *srcptr = srcptr_ + srcstep*iy + ix;
WARPAFFINE_SHUFFLE(1);
WARP_SHUFFLE(LINEAR, 1);
warpaffine_linear_calc(1, pxy, dstptr+x, sx, sy);
warp_linear_calc(1, pxy, dstptr+x, sx, sy);
}
template<typename T>
void CV_Remap_Test::new_linear_c3(int x, float sx, float sy, const T *srcptr_, T *dstptr,
int srccols, int srcrows, size_t srcstep,
@ -984,11 +1040,10 @@ void CV_Remap_Test::new_linear_c3(int x, float sx, float sy, const T *srcptr_, T
T pxy[12];
const T *srcptr = srcptr_ + srcstep*iy + ix*3;
WARPAFFINE_SHUFFLE(3);
WARP_SHUFFLE(LINEAR, 3);
warpaffine_linear_calc(3, pxy, dstptr+x*3, sx, sy);
warp_linear_calc(3, pxy, dstptr+x*3, sx, sy);
}
template<typename T>
void CV_Remap_Test::new_linear_c4(int x, float sx, float sy, const T *srcptr_, T *dstptr,
int srccols, int srcrows, size_t srcstep,
@ -1000,13 +1055,14 @@ void CV_Remap_Test::new_linear_c4(int x, float sx, float sy, const T *srcptr_, T
T pxy[16];
const T *srcptr = srcptr_ + srcstep*iy + ix*4;
WARPAFFINE_SHUFFLE(4);
WARP_SHUFFLE(LINEAR, 4);
warpaffine_linear_calc(4, pxy, dstptr+x*4, sx, sy);
warp_linear_calc(4, pxy, dstptr+x*4, sx, sy);
}
template <typename T>
void CV_Remap_Test::new_remap(const Mat &_src, Mat &_dst) {
void CV_Remap_Test::new_remap(const Mat &_src, Mat &_dst, int inter) {
CV_UNUSED(inter);
int src_channels = _src.channels();
CV_CheckTrue(_src.channels() == 1 || _src.channels() == 3 || _src.channels() == 4, "");
CV_CheckTrue(mapx.depth() == CV_32F, "");
@ -1232,7 +1288,7 @@ private:
void warpAffine(const Mat&, Mat&);
template<typename T>
void newWarpAffine(const Mat&, Mat&, const Mat&);
void new_warpAffine(const Mat&, Mat&, const Mat&, int);
};
CV_WarpAffine_Test::CV_WarpAffine_Test() :
@ -1287,8 +1343,9 @@ void CV_WarpAffine_Test::run_reference_func()
}
template<typename T>
void CV_WarpAffine_Test::newWarpAffine(const Mat &_src, Mat &_dst, const Mat &tM)
void CV_WarpAffine_Test::new_warpAffine(const Mat &_src, Mat &_dst, const Mat &tM, int inter)
{
CV_UNUSED(inter);
int num_channels = _dst.channels();
CV_CheckTrue(num_channels == 1 || num_channels == 3 || num_channels == 4, "");
@ -1360,11 +1417,11 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
if (inter == INTER_LINEAR) {
int dst_depth = _dst.depth(), dst_channels = _dst.channels();
if (dst_depth == CV_8U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpAffine<uint8_t>(_src, _dst, tM);
return new_warpAffine<uint8_t>(_src, _dst, tM, inter);
} else if (dst_depth == CV_16U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpAffine<uint16_t>(_src, _dst, tM);
return new_warpAffine<uint16_t>(_src, _dst, tM, inter);
} else if (dst_depth == CV_32F && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpAffine<float>(_src, _dst, tM);
return new_warpAffine<float>(_src, _dst, tM, inter);
}
}
@ -1420,7 +1477,7 @@ private:
void warpPerspective(const Mat&, Mat&);
template<typename T>
void newWarpPerspective(const Mat&, Mat&, const Mat&);
void new_warpPerspective(const Mat&, Mat&, const Mat&, int);
};
CV_WarpPerspective_Test::CV_WarpPerspective_Test() :
@ -1470,8 +1527,9 @@ void CV_WarpPerspective_Test::run_reference_func()
}
template<typename T>
void CV_WarpPerspective_Test::newWarpPerspective(const Mat &_src, Mat &_dst, const Mat &tM)
void CV_WarpPerspective_Test::new_warpPerspective(const Mat &_src, Mat &_dst, const Mat &tM, int inter)
{
CV_UNUSED(inter);
int num_channels = _dst.channels();
CV_CheckTrue(num_channels == 1 || num_channels == 3 || num_channels == 4, "");
@ -1546,11 +1604,11 @@ void CV_WarpPerspective_Test::warpPerspective(const Mat& _src, Mat& _dst)
if (inter == INTER_LINEAR) {
int dst_depth = _dst.depth(), dst_channels = _dst.channels();
if (dst_depth == CV_8U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpPerspective<uint8_t>(_src, _dst, M);
return new_warpPerspective<uint8_t>(_src, _dst, M, inter);
} else if (dst_depth == CV_16U && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpPerspective<uint16_t>(_src, _dst, M);
return new_warpPerspective<uint16_t>(_src, _dst, M, inter);
} else if (dst_depth == CV_32F && (dst_channels == 1 || dst_channels == 3 || dst_channels == 4)) {
return newWarpPerspective<float>(_src, _dst, M);
return new_warpPerspective<float>(_src, _dst, M, inter);
}
}