Merge pull request #25786 from plctlab:rvp_3rdparty

3rdparty: NDSRVP - Part 1.5: New Interfaces
This commit is contained in:
Alexander Smorkalov 2024-08-05 15:26:31 +03:00 committed by GitHub
commit ecbff5a20c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 829 additions and 517 deletions

View File

@ -1,6 +1,6 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_CORE_HPP
#define OPENCV_NDSRVP_CORE_HPP

View File

@ -1,18 +1,12 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_IMGPROC_HPP
#define OPENCV_NDSRVP_IMGPROC_HPP
namespace cv {
// ################ remap ################
void remap(InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue);
namespace ndsrvp {
enum InterpolationMasks {
@ -36,23 +30,36 @@ int integral(int depth, int sdepth, int sqdepth,
// ################ warpAffine ################
int warpAffine(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4]);
int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
#undef cv_hal_warpAffine
#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
#undef cv_hal_warpAffineBlocklineNN
#define cv_hal_warpAffineBlocklineNN (cv::ndsrvp::warpAffineBlocklineNN)
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#undef cv_hal_warpAffineBlockline
#define cv_hal_warpAffineBlockline (cv::ndsrvp::warpAffineBlockline)
// ################ warpPerspective ################
int warpPerspective(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4]);
int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
#undef cv_hal_warpPerspective
#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
#undef cv_hal_warpPerspectiveBlocklineNN
#define cv_hal_warpPerspectiveBlocklineNN (cv::ndsrvp::warpPerspectiveBlocklineNN)
int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
#undef cv_hal_warpPerspectiveBlockline
#define cv_hal_warpPerspectiveBlockline (cv::ndsrvp::warpPerspectiveBlockline)
// ################ remap ################
int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]);
#undef cv_hal_remap32f
#define cv_hal_remap32f (cv::ndsrvp::remap32f)
// ################ threshold ################

View File

@ -1,13 +1,14 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_HAL_HPP
#define OPENCV_NDSRVP_HAL_HPP
#include "opencv2/core/mat.hpp"
#include <nds_intrinsic.h>
#include "opencv2/core/hal/interface.h"
#include "include/core.hpp"
#include "include/imgproc.hpp"
#include "include/features2d.hpp"

78
3rdparty/ndsrvp/src/cvutils.cpp vendored Normal file
View File

@ -0,0 +1,78 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
// fastMalloc
// [0][1][2][3][4][5][6][7][8][9]
// ^udata
// ^adata
// ^adata[-1] == udata
void* fastMalloc(size_t size)
{
uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN);
if(!udata)
ndsrvp_error(Error::StsNoMem, "fastMalloc(): Not enough memory");
uchar** adata = (uchar**)align((size_t)((uchar**)udata + 1), CV_MALLOC_ALIGN);
adata[-1] = udata;
return adata;
}
void fastFree(void* ptr)
{
if(ptr)
{
uchar* udata = ((uchar**)ptr)[-1];
if(!(udata < (uchar*)ptr && ((uchar*)ptr - udata) <= (ptrdiff_t)(sizeof(void*) + CV_MALLOC_ALIGN)))
ndsrvp_error(Error::StsBadArg, "fastFree(): Invalid memory block");
free(udata);
}
}
// borderInterpolate
// Maps coordinate `p`, which may lie outside [0, len), to the index that
// should be read instead, according to `borderType`:
//   - in range                 : p is returned unchanged
//   - CV_HAL_BORDER_REPLICATE  : clamped to 0 or len - 1
//   - CV_HAL_BORDER_REFLECT    : mirrored, edge sample repeated
//   - CV_HAL_BORDER_REFLECT_101: mirrored, edge sample not repeated
//   - CV_HAL_BORDER_WRAP       : taken modulo len
//   - CV_HAL_BORDER_CONSTANT   : -1 (caller substitutes the border value)
// Any other border type is reported through ndsrvp_error(StsBadArg).
int borderInterpolate(int p, int len, int borderType)
{
    // One unsigned comparison covers both p < 0 and p >= len.
    if( (unsigned)p < (unsigned)len )
        ;
    else if( borderType == CV_HAL_BORDER_REPLICATE )
        p = p < 0 ? 0 : len - 1;
    else if( borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101 )
    {
        // With len == 0 the loop condition below is always true, so the loop
        // would never terminate; reject it the same way the WRAP branch does.
        ndsrvp_assert(len > 0);
        // REFLECT_101 skips the edge sample when mirroring, REFLECT repeats it.
        int delta = borderType == CV_HAL_BORDER_REFLECT_101;
        if( len == 1 )
            return 0;
        // Fold repeatedly: a coordinate far outside may need several reflections.
        do
        {
            if( p < 0 )
                p = -p - 1 + delta;
            else
                p = len - 1 - (p - len) - delta;
        }
        while( (unsigned)p >= (unsigned)len );
    }
    else if( borderType == CV_HAL_BORDER_WRAP )
    {
        ndsrvp_assert(len > 0);
        // Shift negative coordinates up by a whole number of periods first,
        // so that the remainder below only ever sees non-negative values.
        if( p < 0 )
            p -= ((p - len + 1) / len) * len;
        if( p >= len )
            p %= len;
    }
    else if( borderType == CV_HAL_BORDER_CONSTANT )
        p = -1;
    else
        ndsrvp_error(Error::StsBadArg, "borderInterpolate(): Unknown/unsupported border type");
    return p;
}
} // namespace ndsrvp
} // namespace cv

108
3rdparty/ndsrvp/src/cvutils.hpp vendored Normal file
View File

@ -0,0 +1,108 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_CVUTILS_HPP
#define OPENCV_NDSRVP_CVUTILS_HPP
#include <nds_intrinsic.h>
#include "opencv2/core/hal/interface.h"
#include <cstring>
#include <cmath>
#include <iostream>
#include <string>
#include <array>
#include <climits>
#include <algorithm>
// Miscellaneous helpers that are not exposed through the public interface
namespace cv {
namespace ndsrvp {
// Allocates `size` bytes aligned to CV_MALLOC_ALIGN; reports StsNoMem through
// ndsrvp_error() when malloc() fails. Release the result with fastFree().
void* fastMalloc(size_t size);
// Releases a block returned by fastMalloc(); a null pointer is ignored.
void fastFree(void* ptr);
// Maps an out-of-range coordinate `p` onto [0, len) according to `borderType`
// (one of the CV_HAL_BORDER_* values); yields -1 for CV_HAL_BORDER_CONSTANT.
int borderInterpolate(int p, int len, int borderType);
// Larger of two values; only defined here if no MAX macro exists yet.
// Note that each argument may be evaluated twice.
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
// Bit mask of the channel-count field inside a matrix type/flags word,
// built from CV_CN_MAX and CV_CN_SHIFT.
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
// Number of channels encoded in `flags` (the field stores channels - 1).
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
// Alignment, in bytes, of pointers returned by fastMalloc().
#define CV_MALLOC_ALIGN 64
// error codes
// Negative values are treated as fatal by ndsrvp_error().
enum Error{
    StsNoMem = -4,
    StsBadArg = -5,
    StsAssert = -215
};

// output error
// Reports StsAssert, with the failed expression as the message, when `expr`
// is false. Wrapped in do { } while(0) so that the macro behaves as a single
// statement and can be used safely in an unbraced if/else.
#define ndsrvp_assert(expr) do { if(!(expr)) ndsrvp_error(Error::StsAssert, std::string(#expr)); } while(0)

// Prints the error code (and the optional message) to std::cerr.
// Negative codes are then thrown as a plain `int`; non-negative codes only
// log and return to the caller.
inline void ndsrvp_error(int code, const std::string& msg = std::string())
{
    std::cerr << "NDSRVP Error: code " << code << '\n';
    if(!msg.empty())
        std::cerr << msg << '\n';
    if(code < 0)
        throw code;
}
// clip & vclip
// Clamps x to the half-open range [a, b): values below a become a,
// values at or above b become b - 1.
inline int clip(int x, int a, int b)
{
    if(x < a)
        return a;
    if(x < b)
        return x;
    return b - 1;
}
// Lane-wise counterpart of clip(): each lane of x is clamped to [a, b).
//
// __nds__bpick(p, q, mask) is used throughout this HAL as "mask ? p : q"
// (e.g. the binary threshold operator is bpick(maxval, 0, src > thresh)), and
// the vector comparisons yield all-ones in lanes where they hold. The previous
// operand order therefore returned `a` for lanes that were already >= a;
// the operands are arranged here to match the scalar clip():
//   x >= a ? (x < b ? x : b - 1) : a
// NOTE(review): operand order derived from the other bpick uses in this HAL —
// confirm against the intrinsic's documentation.
inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
{
    return (int32x2_t)__nds__bpick(
        __nds__bpick((long)x, (long)(b - 1), (long)(x < b)), // x < b ? x : b - 1
        (long)a,
        (long)(x >= a));
}
// saturate
// Conversions from int/float/double to a narrower pixel type.
// The generic templates are plain conversions without any clamping; the
// specializations below cover the 8- and 16-bit integer targets and int.
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
// uchar: the int overload goes through __nds__uclip32(v, 8) — presumably a
// clamp to the unsigned 8-bit range; confirm against nds_intrinsic.h.
// The float/double overloads round with lrintf()/lrint() and reuse the int overload.
// NOTE(review): the (int) cast of the lrint result is a plain conversion, so
// inputs whose rounded value does not fit in int are not saturated — confirm
// this is acceptable for callers.
template<> inline uchar saturate_cast<uchar>(int v) { return __nds__uclip32(v, 8); }
template<> inline uchar saturate_cast<uchar>(float v) { return saturate_cast<uchar>((int)lrintf(v)); }
template<> inline uchar saturate_cast<uchar>(double v) { return saturate_cast<uchar>((int)lrint(v)); }
// char: same scheme via __nds__sclip32(v, 7) (presumably the signed 8-bit range).
// NOTE(review): whether plain `char` is signed is implementation-defined — verify for the target.
template<> inline char saturate_cast<char>(int v) { return __nds__sclip32(v, 7); }
template<> inline char saturate_cast<char>(float v) { return saturate_cast<char>((int)lrintf(v)); }
template<> inline char saturate_cast<char>(double v) { return saturate_cast<char>((int)lrint(v)); }
// ushort: same scheme via __nds__uclip32(v, 16).
template<> inline ushort saturate_cast<ushort>(int v) { return __nds__uclip32(v, 16); }
template<> inline ushort saturate_cast<ushort>(float v) { return saturate_cast<ushort>((int)lrintf(v)); }
template<> inline ushort saturate_cast<ushort>(double v) { return saturate_cast<ushort>((int)lrint(v)); }
// short: same scheme via __nds__sclip32(v, 15).
template<> inline short saturate_cast<short>(int v) { return __nds__sclip32(v, 15); }
template<> inline short saturate_cast<short>(float v) { return saturate_cast<short>((int)lrintf(v)); }
template<> inline short saturate_cast<short>(double v) { return saturate_cast<short>((int)lrint(v)); }
// int: floating-point inputs are rounded with lrintf()/lrint(); no clamping is applied.
template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
// align
// Rounds v up to the next multiple of n. The bit trick relies on n being a
// power of two, which is how fastMalloc() uses it (n == CV_MALLOC_ALIGN).
inline long align(size_t v, int n)
{
    const size_t step = (size_t)n;
    const size_t mask = (size_t)(-n); // all bits set except the low log2(n) bits
    const size_t bumped = v + step - 1;
    return (long)(bumped & mask);
}
} // namespace ndsrvp
} // namespace cv
#endif

View File

@ -3,6 +3,8 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {

188
3rdparty/ndsrvp/src/remap.cpp vendored Normal file
View File

@ -0,0 +1,188 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
int remap32f(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4])
{
const bool isRelative = ((interpolation & CV_HAL_WARP_RELATIVE_MAP) != 0);
interpolation &= ~CV_HAL_WARP_RELATIVE_MAP;
if( interpolation == CV_HAL_INTER_AREA )
interpolation = CV_HAL_INTER_LINEAR;
if( interpolation != CV_HAL_INTER_NEAREST )
return CV_HAL_ERROR_NOT_IMPLEMENTED;
// only CV_8U
if( (src_type & CV_MAT_DEPTH_MASK) != CV_8U )
return CV_HAL_ERROR_NOT_IMPLEMENTED;
int cn = CV_MAT_CN(src_type);
src_step /= sizeof(uchar);
dst_step /= sizeof(uchar);
// mapping CV_32FC1
mapx_step /= sizeof(float);
mapy_step /= sizeof(float);
// border
uchar border_const[CV_CN_MAX];
for( int k = 0; k < CV_CN_MAX; k++ )
border_const[k] = saturate_cast<uchar>(border_value[k & 3]);
// divide into blocks
const int BLOCK_SIZE = 1024;
int x, y, x1, y1;
std::array<short, BLOCK_SIZE * BLOCK_SIZE * 2> aXY;
short* XY = aXY.data();
size_t XY_step = BLOCK_SIZE * 2;
// vectorize
const int32x2_t src_wh = {src_width, src_height};
const int32x2_t arr_index = {cn, (int)src_step};
for (y = 0; y < dst_height; y += BLOCK_SIZE)
{
int dy = std::min(BLOCK_SIZE, dst_height - y);
for (x = 0; x < dst_width; x += BLOCK_SIZE)
{
const int off_y = isRelative ? y : 0;
const int off_x = isRelative ? x : 0;
const int32x2_t voff = {off_x, off_y};
int dx = std::min(BLOCK_SIZE, dst_width - x);
// prepare mapping data XY
for (y1 = 0; y1 < dy; y1++)
{
short* rXY = XY + y1 * XY_step;
const float* sX = mapx + (y + y1) * mapx_step + x;
const float* sY = mapy + (y + y1) * mapy_step + x;
for (x1 = 0; x1 < dx; x1++)
{
rXY[x1 * 2] = saturate_cast<short>(sX[x1]);
rXY[x1 * 2 + 1] = saturate_cast<short>(sY[x1]);
}
}
// precalulate offset
if(isRelative)
{
int16x8_t voff_x;
int16x8_t voff_y = {0, 0, 1, 0, 2, 0, 3, 0};
int16x8_t vones_x = {4, 0, 4, 0, 4, 0, 4, 0};
int16x8_t vones_y = {0, 1, 0, 1, 0, 1, 0, 1};
for(y1 = 0; y1 < BLOCK_SIZE; y1++, voff_y += vones_y)
{
int16x8_t* vrXY = (int16x8_t*)(XY + y1 * XY_step);
for(x1 = 0, voff_x = voff_y; x1 < BLOCK_SIZE; x1 += 4, vrXY++, voff_x += vones_x)
{
*vrXY += voff_x;
}
}
}
// process the block
for( y1 = 0; y1 < dy; y1++ )
{
uchar* dst_row = dst_data + (y + y1) * dst_step + x * cn;
const short* rXY = XY + y1 * XY_step;
if( cn == 1 )
{
for( x1 = 0; x1 < dx; x1++ )
{
int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
else
{
if( border_type == CV_HAL_BORDER_REPLICATE )
{
vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
}
else if( border_type == CV_HAL_BORDER_CONSTANT )
dst_row[x1] = border_const[0];
else if( border_type != CV_HAL_BORDER_TRANSPARENT )
{
vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
}
}
}
}
else
{
uchar* dst_ptr = dst_row;
for(x1 = 0; x1 < dx; x1++, dst_ptr += cn )
{
int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
const uchar *src_ptr;
if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
{
if( cn == 3 )
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2];
// performance loss, commented out
// *(unsigned*)dst_ptr = __nds__bpick(*(unsigned*)dst_ptr, *(unsigned*)src_ptr, 0xFF000000);
}
else if( cn == 4 )
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
*(uint8x4_t*)dst_ptr = *(uint8x4_t*)src_ptr;
}
else
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
int k = cn;
for(; k >= 8; k -= 8, dst_ptr += 8, src_ptr += 8)
*(uint8x8_t*)dst_ptr = *(uint8x8_t*)src_ptr;
while( k-- )
dst_ptr[k] = src_ptr[k];
}
}
else if( border_type != CV_HAL_BORDER_TRANSPARENT )
{
if( border_type == CV_HAL_BORDER_REPLICATE )
{
vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
}
else if( border_type == CV_HAL_BORDER_CONSTANT )
src_ptr = &border_const[0];
else
{
vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
}
int k = cn;
for(; k >= 8; k -= 8, dst_ptr += 8, src_ptr += 8)
*(uint8x8_t*)dst_ptr = *(uint8x8_t*)src_ptr;
while( k-- )
dst_ptr[k] = src_ptr[k];
}
}
}
}
}
}
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp
} // namespace cv

View File

@ -4,65 +4,44 @@
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
template <typename type, typename vtype>
class operators_threshold_t {
public:
virtual ~operators_threshold_t() {};
virtual inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)src;
(void)thresh;
(void)maxval;
CV_Error(cv::Error::StsBadArg, "");
return vtype();
}
virtual inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)src;
(void)thresh;
(void)maxval;
CV_Error(cv::Error::StsBadArg, "");
return type();
}
};
template <typename type, typename vtype>
class opThreshBinary : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshBinary_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
return (vtype)__nds__bpick((long)maxval, (long)0, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
return src > thresh ? maxval : 0;
}
};
template <typename type, typename vtype>
class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshBinaryInv_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
return (vtype)__nds__bpick((long)0, (long)maxval, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
return src > thresh ? 0 : maxval;
}
};
template <typename type, typename vtype>
class opThreshTrunc : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshTrunc_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)thresh, (long)src, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? thresh : src;
@ -70,13 +49,13 @@ class opThreshTrunc : public operators_threshold_t<type, vtype> {
};
template <typename type, typename vtype>
class opThreshToZero : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshToZero_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)src, (long)0, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? src : 0;
@ -84,29 +63,36 @@ class opThreshToZero : public operators_threshold_t<type, vtype> {
};
template <typename type, typename vtype>
class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshToZeroInv_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)0, (long)src, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? 0 : src;
}
};
template <typename type, typename vtype, int nlane>
static void threshold_op(const type* src_data, size_t src_step,
type* dst_data, size_t dst_step,
template <typename type, typename vtype, int nlane,
template <typename ttype, typename vttype> typename opThresh_t>
static inline void threshold_op(const uchar* src, size_t src_step,
uchar* dst, size_t dst_step,
int width, int height, int cn,
type thresh, type maxval, int thtype)
double thresh_d, double maxval_d)
{
int i, j;
width *= cn;
type* src_data = (type*)src;
type* dst_data = (type*)dst;
src_step /= sizeof(type);
dst_step /= sizeof(type);
type thresh = saturate_cast<type>(thresh_d);
type maxval = saturate_cast<type>(maxval_d);
vtype vthresh;
vtype vmaxval;
for (i = 0; i < nlane; i++) {
@ -114,62 +100,63 @@ static void threshold_op(const type* src_data, size_t src_step,
vmaxval[i] = maxval;
}
operators_threshold_t<type, vtype>* op;
switch (thtype) {
case CV_HAL_THRESH_BINARY:
op = new opThreshBinary<type, vtype>();
break;
case CV_HAL_THRESH_BINARY_INV:
op = new opThreshBinaryInv<type, vtype>();
break;
case CV_HAL_THRESH_TRUNC:
op = new opThreshTrunc<type, vtype>();
break;
case CV_HAL_THRESH_TOZERO:
op = new opThreshToZero<type, vtype>();
break;
case CV_HAL_THRESH_TOZERO_INV:
op = new opThreshToZeroInv<type, vtype>();
break;
default:
CV_Error(cv::Error::StsBadArg, "");
return;
}
opThresh_t<type, vtype> opThresh;
for (i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
for (j = 0; j <= width - nlane; j += nlane) {
vtype vs = *(vtype*)(src_data + j);
*(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
*(vtype*)(dst_data + j) = opThresh.vector(*(vtype*)(src_data + j), vthresh, vmaxval);
}
for (; j < width; j++) {
dst_data[j] = op->scalar(src_data[j], thresh, maxval);
dst_data[j] = opThresh.scalar(src_data[j], thresh, maxval);
}
}
delete op;
return;
}
typedef void (*ThreshFunc)(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int cn,
double thresh, double maxval);
int threshold(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int depth, int cn,
double thresh, double maxValue, int thresholdType)
{
if (width <= 255 && height <= 255) // slower at small size
static ThreshFunc thfuncs[4][5] =
{
{
threshold_op<uchar, uint8x8_t, 8, opThreshBinary_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshBinaryInv_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshTrunc_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshToZero_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshToZeroInv_t> },
{
threshold_op<char, int8x8_t, 8, opThreshBinary_t>,
threshold_op<char, int8x8_t, 8, opThreshBinaryInv_t>,
threshold_op<char, int8x8_t, 8, opThreshTrunc_t>,
threshold_op<char, int8x8_t, 8, opThreshToZero_t>,
threshold_op<char, int8x8_t, 8, opThreshToZeroInv_t> },
{
threshold_op<ushort, uint16x4_t, 4, opThreshBinary_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshBinaryInv_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshTrunc_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshToZero_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshToZeroInv_t> },
{
threshold_op<short, int16x4_t, 4, opThreshBinary_t>,
threshold_op<short, int16x4_t, 4, opThreshBinaryInv_t>,
threshold_op<short, int16x4_t, 4, opThreshTrunc_t>,
threshold_op<short, int16x4_t, 4, opThreshToZero_t>,
threshold_op<short, int16x4_t, 4, opThreshToZeroInv_t> }
};
if(depth < 0 || depth > 3 || thresholdType < 0 || thresholdType > 4 || (width < 256 && height < 256))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (depth == CV_8U) {
threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else if (depth == CV_16S) {
threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else if (depth == CV_16U) {
threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
thfuncs[depth][thresholdType](src_data, src_step, dst_data, dst_step, width, height, cn, thresh, maxValue);
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp

View File

@ -3,148 +3,68 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
class WarpAffineInvoker : public ParallelLoopBody {
public:
WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
: ParallelLoopBody()
, src(_src)
, dst(_dst)
, interpolation(_interpolation)
, borderType(_borderType)
, borderValue(_borderValue)
, adelta(_adelta)
, bdelta(_bdelta)
, M(_M)
{
}
virtual void operator()(const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 64;
AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
for (y = range.start; y < range.end; y += bh0) {
for (x = 0; x < dst.cols; x += bw0) {
int bw = std::min(bw0, dst.cols - x);
int bh = std::min(bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY);
Mat dpart(dst, Rect(x, y, bw, bh));
for (y1 = 0; y1 < bh; y1++) {
short* xy = XY + y1 * bw * 2;
int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
if (interpolation == CV_HAL_INTER_NEAREST) {
x1 = 0;
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
for (; x1 < bw; x1++) {
int X = (X0 + adelta[x + x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
} else {
short* alpha = A + y1 * bw;
x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
}
}
}
if (interpolation == CV_HAL_INTER_NEAREST)
remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
else {
Mat _matA(bh, bw, CV_16U, A);
remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
}
}
}
}
private:
Mat src;
Mat dst;
int interpolation, borderType;
Scalar borderValue;
int *adelta, *bdelta;
const double* M;
};
int warpAffine(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4])
int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
int x;
AutoBuffer<int> _abdelta(dst.cols * 2);
int *adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int x1 = 0;
for (x = 0; x < dst.cols; x++) {
adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
for (; x1 < bw; x1++) {
int X = X0 + adelta[x1];
int Y = Y0 + bdelta[x1];
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
return CV_HAL_ERROR_OK;
}
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
int X = X0 + adelta[x1];
int Y = Y0 + bdelta[x1];
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
}
Range range(0, dst.rows);
WarpAffineInvoker invoker(src, dst, interpolation, borderType,
Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
adelta, bdelta, M);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
return CV_HAL_ERROR_OK;
}

View File

@ -3,154 +3,90 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
class WarpPerspectiveInvoker : public ParallelLoopBody {
public:
WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
int _borderType, const Scalar& _borderValue)
: ParallelLoopBody()
, src(_src)
, dst(_dst)
, M(_M)
, interpolation(_interpolation)
, borderType(_borderType)
, borderValue(_borderValue)
{
}
virtual void operator()(const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
int x, y, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ / 2, height);
int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
for (y = range.start; y < range.end; y += bh0) {
for (x = 0; x < width; x += bw0) {
int bw = std::min(bw0, width - x);
int bh = std::min(bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY);
Mat dpart(dst, Rect(x, y, bw, bh));
for (y1 = 0; y1 < bh; y1++) {
short* xy = XY + y1 * bw * 2;
double X0 = M[0] * x + M[1] * (y + y1) + M[2];
double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
double W0 = M[6] * x + M[7] * (y + y1) + M[8];
if (interpolation == CV_HAL_INTER_NEAREST) {
int x1 = 0;
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? 1. / W1 : 0;
W2 = W2 ? 1. / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
vX = __nds__v_sclip32(vX, 15);
vY = __nds__v_sclip32(vY, 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? 1. / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
} else {
short* alpha = A + y1 * bw;
int x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? INTER_TAB_SIZE / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
}
}
}
if (interpolation == CV_HAL_INTER_NEAREST)
remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
else {
Mat _matA(bh, bw, CV_16U, A);
remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
}
}
}
}
private:
Mat src;
Mat dst;
const double* M;
int interpolation, borderType;
Scalar borderValue;
};
int warpPerspective(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4])
// Computes one row of nearest-neighbour perspective-warp source coordinates.
//
// M   : 3x3 transform coefficients (only M[0], M[3] and M[6] are read here;
//       the row-constant terms are already folded into X0/Y0/W0).
// xy  : output, bw interleaved (x', y') pairs of 16-bit coordinates.
// X0, Y0, W0 : numerators and denominator of the projective division at the
//       first pixel of the row.
// bw  : number of pixels in the row.
//
// Always returns CV_HAL_ERROR_OK.
int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
{
    int x1 = 0;
    // Paired path: two pixels per iteration. The bound requires a *complete*
    // pair to remain; with the former `x1 < bw` bound an odd bw stored one
    // pixel (two shorts) beyond the row and the scalar tail below was dead.
    for (; x1 + 1 < bw; x1 += 2) {
        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
        // A zero denominator maps to scale 0 instead of dividing by zero.
        W1 = W1 ? 1. / W1 : 0;
        W2 = W2 ? 1. / W2 : 0;
        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
        // Clip both lanes to the signed 16-bit range (the vector counterpart
        // of saturate_cast<short> in the tail loop).
        vX = __nds__v_sclip32(vX, 15);
        vY = __nds__v_sclip32(vY, 15);
        // Interleave the low halves so memory holds x0, y0, x1, y1.
        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
    }
    // Scalar tail: handles the last pixel when bw is odd.
    for (; x1 < bw; x1++) {
        double W = W0 + M[6] * x1;
        W = W ? 1. / W : 0;
        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
        int X = saturate_cast<int>(fX);
        int Y = saturate_cast<int>(fY);
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
    return CV_HAL_ERROR_OK;
}
// Computes one row of perspective-warp source coordinates for interpolating
// modes: the integer part goes to xy, the fractional part to alpha.
//
// M     : 3x3 transform coefficients (only M[0], M[3] and M[6] are read here;
//         the row-constant terms are already folded into X0/Y0/W0).
// xy    : output, bw interleaved (x', y') pairs of 16-bit integer coordinates.
// alpha : output, bw entries; each packs the fractional bits as
//         (Y & mask) * INTER_TAB_SIZE + (X & mask).
// X0, Y0, W0 : numerators and denominator of the projective division at the
//         first pixel of the row.
// bw    : number of pixels in the row.
//
// Always returns CV_HAL_ERROR_OK.
int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
{
    int x1 = 0;
    const int INTER_MASK = INTER_TAB_SIZE - 1;
    const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
    // Paired path: two pixels per iteration. The bound requires a *complete*
    // pair to remain; with the former `x1 < bw` bound an odd bw stored one
    // element past the end of both xy and alpha and the scalar tail was dead.
    for (; x1 + 1 < bw; x1 += 2) {
        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
        // Scale by INTER_TAB_SIZE so the low INTER_BITS bits carry the
        // fraction; a zero denominator maps to scale 0.
        W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
        W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
        // Integer part: arithmetic shift, then clip to the signed 16-bit range.
        int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
        int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
        // Interleave the low halves so memory holds x0, y0, x1, y1.
        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
        // Fractional part: (Y & mask) << INTER_BITS plus (X & mask).
        uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
        *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
    }
    // Scalar tail: handles the last pixel when bw is odd.
    for (; x1 < bw; x1++) {
        double W = W0 + M[6] * x1;
        W = W ? INTER_TAB_SIZE / W : 0;
        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
        int X = saturate_cast<int>(fX);
        int Y = saturate_cast<int>(fY);
        xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
        alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
    }
    return CV_HAL_ERROR_OK;
}

View File

@ -1040,7 +1040,7 @@ foreach(hal ${OpenCV_HAL})
ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
else()
message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...")
endif()
elseif(hal STREQUAL "halrvv")
if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")

View File

@ -108,11 +108,19 @@ CV_EXPORTS void warpAffine(int src_type,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4]);
CV_EXPORTS void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
CV_EXPORTS void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
CV_EXPORTS void warpPerspective(int src_type,
const uchar * src_data, size_t src_step, int src_width, int src_height,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4]);
CV_EXPORTS void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
CV_EXPORTS void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,

View File

@ -12,6 +12,12 @@
#define CV_HAL_INTER_CUBIC 2
#define CV_HAL_INTER_AREA 3
#define CV_HAL_INTER_LANCZOS4 4
#define CV_HAL_INTER_LINEAR_EXACT 5
#define CV_HAL_INTER_NEAREST_EXACT 6
#define CV_HAL_INTER_MAX 7
#define CV_HAL_WARP_FILL_OUTLIERS 8
#define CV_HAL_WARP_INVERSE_MAP 16
#define CV_HAL_WARP_RELATIVE_MAP 32
//! @}
//! @name Morphology operations

View File

@ -273,6 +273,29 @@ inline int hal_ni_resize(int src_type, const uchar *src_data, size_t src_step, i
@sa cv::warpAffine, cv::hal::warpAffine
*/
inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpAffineBlocklineNN computes one row of an affine transformation (nearest-neighbour variant: only coordinates are produced, no interpolation weights)
@param adelta input array of precomputed M0 * x terms, one per pixel of the row
@param bdelta input array of precomputed M3 * x terms, one per pixel of the row
@param xy output array of interleaved (x', y') coordinates, one pair per pixel of the row
@param X0 input M1 * y + M2 value for this row
@param Y0 input M4 * y + M5 value for this row
@param bw length of the row, in pixels
@return this default stub always returns CV_HAL_ERROR_NOT_IMPLEMENTED
@sa cv::warpAffineBlocklineNN, cv::hal::warpAffineBlocklineNN
*/
inline int hal_ni_warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpAffineBlockline computes one row of an affine transformation, producing both coordinates and interpolation fractions
@param adelta input array of precomputed M0 * x terms, one per pixel of the row
@param bdelta input array of precomputed M3 * x terms, one per pixel of the row
@param xy output array of interleaved (x', y') coordinates, one pair per pixel of the row
@param alpha output array holding the least significant bits of the (x', y') coordinates, used for interpolation; one entry per pixel
@param X0 input M1 * y + M2 value for this row
@param Y0 input M4 * y + M5 value for this row
@param bw length of the row, in pixels
@return this default stub always returns CV_HAL_ERROR_NOT_IMPLEMENTED
@sa cv::warpAffineBlockline, cv::hal::warpAffineBlockline
*/
inline int hal_ni_warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspective
@param src_type source and destination image type
@ -291,11 +314,38 @@ inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_ste
@sa cv::warpPerspective, cv::hal::warpPerspective
*/
inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspectiveBlocklineNN computes one row of a perspective transformation (nearest-neighbour variant: only coordinates are produced, no interpolation weights)
@param M 3x3 matrix with transform coefficients
@param xy output array of interleaved (x', y') coordinates, one pair per pixel of the row
@param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
@param Y0 input M3 * x0 + M4 * y + M5 value
@param W0 input M6 * x0 + M7 * y + M8 value
@param bw length of the row, in pixels
@return this default stub always returns CV_HAL_ERROR_NOT_IMPLEMENTED
@sa cv::warpPerspectiveBlocklineNN, cv::hal::warpPerspectiveBlocklineNN
*/
inline int hal_ni_warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspectiveBlockline computes one row of a perspective transformation, producing both coordinates and interpolation fractions
@param M 3x3 matrix with transform coefficients
@param xy output array of interleaved (x', y') coordinates, one pair per pixel of the row
@param alpha output array holding the least significant bits of the (x', y') coordinates, used for interpolation; one entry per pixel
@param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
@param Y0 input M3 * x0 + M4 * y + M5 value
@param W0 input M6 * x0 + M7 * y + M8 value
@param bw length of the row, in pixels
@return this default stub always returns CV_HAL_ERROR_NOT_IMPLEMENTED
@sa cv::warpPerspectiveBlockline, cv::hal::warpPerspectiveBlockline
*/
inline int hal_ni_warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_resize hal_ni_resize
#define cv_hal_warpAffine hal_ni_warpAffine
#define cv_hal_warpAffineBlocklineNN hal_ni_warpAffineBlocklineNN
#define cv_hal_warpAffineBlockline hal_ni_warpAffineBlockline
#define cv_hal_warpPerspective hal_ni_warpPerspective
#define cv_hal_warpPerspectiveBlocklineNN hal_ni_warpPerspectiveBlocklineNN
#define cv_hal_warpPerspectiveBlockline hal_ni_warpPerspectiveBlockline
//! @endcond
/**

View File

@ -2268,16 +2268,7 @@ public:
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
#endif
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, y1;
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@ -2300,84 +2291,9 @@ public:
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
if( interpolation == INTER_NEAREST )
{
x1 = 0;
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
hal::warpAffineBlocklineNN(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_TRY_AVX2
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_TRY_LASX
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
hal::warpAffineBlockline(adelta + x, bdelta + x, xy, A + y1*bw, X0, Y0, bw);
}
if( interpolation == INTER_NEAREST )
@ -2802,6 +2718,97 @@ void warpAffine(int src_type,
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
// Computes one row of source coordinates for a nearest-neighbour affine warp.
//
// adelta, bdelta : per-column fixed-point terms, indexed 0..bw-1.
// xy             : output, bw interleaved (x, y) pairs of 16-bit coordinates.
// X0, Y0         : per-row fixed-point terms added to adelta/bdelta.
// bw             : number of pixels in the row.
//
// Each coordinate is (X0 + adelta[i]) >> AB_BITS (resp. Y0/bdelta), saturated
// to short.
void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
// Offer the row to a registered HAL backend first.
// NOTE(review): CALL_HAL is defined outside this chunk; presumably it returns
// early when the backend reports success - confirm.
CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
#if CV_TRY_SSE4_1
// Runtime-dispatched SSE4.1 kernel processes the entire row; the generic
// block below is only the `else` branch of this check.
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
// Universal-intrinsics path: `span` pixels per iteration, loaded as two
// 4-lane int32 vectors per coordinate and packed down to int16.
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
#undef CV_CONVERT_MAP
// Interleave x and y so memory holds x0, y0, x1, y1, ...
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
// Scalar tail (or the whole row when no SIMD path is compiled in).
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
// Computes one row of source coordinates for an interpolating affine warp.
//
// adelta, bdelta : per-column fixed-point terms, indexed 0..bw-1.
// xy             : output, bw interleaved (x, y) pairs holding the integer
//                  part of each coordinate.
// alpha          : output, bw entries packing the fractional parts as
//                  (Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (X & (INTER_TAB_SIZE-1)).
// X0, Y0         : per-row fixed-point terms added to adelta/bdelta.
// bw             : number of pixels in the row.
void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
// Offer the row to a registered HAL backend first.
// NOTE(review): CALL_HAL is defined outside this chunk; presumably it returns
// early when the backend reports success - confirm.
CALL_HAL(warpAffineBlockline, cv_hal_warpAffineBlockline, adelta, bdelta, xy, alpha, X0, Y0, bw);
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
#if CV_TRY_AVX2
// The wide kernel's return value is stored in x1, and the loops below
// continue from that index.
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
#endif
#if CV_TRY_LASX
// Same contract as the AVX2 kernel above: x1 receives the return value.
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
#endif
{
#if CV_SIMD128
{
// Universal-intrinsics path: 2*span pixels per iteration.
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
// Shift so that the low INTER_BITS bits hold the fractional position.
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1)));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1)));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1 + span)));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1 + span)));
// Integer parts, packed to int16 and stored interleaved as x, y pairs.
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
// Fractional parts: y fraction in the high bits, x fraction in the low bits.
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
// Scalar tail (or the whole row when no SIMD path is compiled in).
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
} // hal::
} // cv::
@ -3204,12 +3211,6 @@ public:
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
#if CV_TRY_SSE4_1
Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
if(CV_CPU_HAS_SUPPORT_SSE4_1)
pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
#endif
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
@ -3228,57 +3229,9 @@ public:
double W0 = M[6]*x + M[7]*(y + y1) + M[8];
if( interpolation == INTER_NEAREST )
{
#if CV_TRY_SSE4_1
if (pwarp_impl_sse4)
pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
else
#endif
#if CV_SIMD128_64F
WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? 1./W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
#endif
}
hal::warpPerspectiveBlocklineNN(M, xy, X0, Y0, W0, bw);
else
{
short* alpha = A + y1*bw;
#if CV_TRY_SSE4_1
if (pwarp_impl_sse4)
pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
else
#endif
#if CV_SIMD128_64F
WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
#endif
}
hal::warpPerspectiveBlockline(M, xy, A + y1*bw, X0, Y0, W0, bw);
}
if( interpolation == INTER_NEAREST )
@ -3371,6 +3324,74 @@ void warpPerspective(int src_type,
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
// Fills `xy` with bw interleaved (x, y) source coordinates for one row of a
// nearest-neighbour perspective warp.
//
// M holds the 3x3 transform coefficients; X0, Y0 and W0 are the two
// numerators and the denominator of the projective division at the first
// pixel of the row.
void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
{
    // A registered HAL backend gets the first chance to process the row.
    CALL_HAL(warpPerspectiveBlocklineNN, cv_hal_warpPerspectiveBlocklineNN, M, xy, X0, Y0, W0, bw);

#if CV_TRY_SSE4_1
    // NOTE(review): the SSE4.1 helper is obtained anew on every call, i.e.
    // once per row - confirm that getImpl() is cheap enough for that.
    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
    if (CV_CPU_HAS_SUPPORT_SSE4_1)
        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);

    if (pwarp_impl_sse4)
        pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
    else
#endif
    {
#if CV_SIMD128_64F
        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
#else
        // Portable fallback: one projective division per pixel.
        const double lowest = (double)INT_MIN;
        const double highest = (double)INT_MAX;
        short* out = xy;
        for (int col = 0; col < bw; ++col, out += 2)
        {
            const double denom = W0 + M[6]*col;
            // A zero denominator yields scale 0 rather than a division by zero.
            const double scale = denom ? 1./denom : 0;
            const double srcX = std::max(lowest, std::min(highest, (X0 + M[0]*col)*scale));
            const double srcY = std::max(lowest, std::min(highest, (Y0 + M[3]*col)*scale));
            out[0] = saturate_cast<short>(saturate_cast<int>(srcX));
            out[1] = saturate_cast<short>(saturate_cast<int>(srcY));
        }
#endif
    }
}
// Fills one row of an interpolating perspective warp: `xy` receives bw
// interleaved (x, y) integer source coordinates and `alpha` receives, per
// pixel, the packed fractional parts used to select interpolation weights.
//
// M holds the 3x3 transform coefficients; X0, Y0 and W0 are the two
// numerators and the denominator of the projective division at the first
// pixel of the row.
void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
{
    // A registered HAL backend gets the first chance to process the row.
    CALL_HAL(warpPerspectiveBlockline, cv_hal_warpPerspectiveBlockline, M, xy, alpha, X0, Y0, W0, bw);

#if CV_TRY_SSE4_1
    // NOTE(review): the SSE4.1 helper is obtained anew on every call, i.e.
    // once per row - confirm that getImpl() is cheap enough for that.
    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
    if (CV_CPU_HAS_SUPPORT_SSE4_1)
        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);

    if (pwarp_impl_sse4)
        pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
    else
#endif
    {
#if CV_SIMD128_64F
        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
#else
        // Portable fallback: one projective division per pixel, scaled by
        // INTER_TAB_SIZE so the low INTER_BITS bits carry the fraction.
        const double lowest = (double)INT_MIN;
        const double highest = (double)INT_MAX;
        short* out = xy;
        for (int col = 0; col < bw; ++col, out += 2)
        {
            const double denom = W0 + M[6]*col;
            // A zero denominator yields scale 0 rather than a division by zero.
            const double scale = denom ? INTER_TAB_SIZE/denom : 0;
            const double srcX = std::max(lowest, std::min(highest, (X0 + M[0]*col)*scale));
            const double srcY = std::max(lowest, std::min(highest, (Y0 + M[3]*col)*scale));
            const int X = saturate_cast<int>(srcX);
            const int Y = saturate_cast<int>(srcY);
            out[0] = saturate_cast<short>(X >> INTER_BITS);
            out[1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[col] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                                 (X & (INTER_TAB_SIZE-1)));
        }
#endif
    }
}
} // hal::
} // cv::