Merge pull request #24412 from vrabaud:inter_area1
Speed up line merging in INTER_AREA #24412

This provides a 10 to 20% speed-up. Related perf test fix: https://github.com/opencv/opencv/pull/24417

This is a split of https://github.com/opencv/opencv/pull/23525, which will be updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in: parent a9664abb57, commit c96f48e7c9
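Context for the diff below: `ResizeArea_Invoker` builds each output row by accumulating weighted source rows into a floating-point `sum` buffer (`sum[dx] = beta*buf[dx]` for the first contributing row, `sum[dx] += beta*buf[dx]` for the rest) and then flushing `sum` into the destination with a saturating cast. The patch factors those three inner loops into `inter_area::mul`, `inter_area::muladd` and `inter_area::saturate_store` so they can be vectorized with OpenCV's universal intrinsics. The sketch below is a minimal, standalone scalar model of that pattern, for illustration only; it is not the OpenCV implementation, and `saturate_u8` is a hand-rolled stand-in for `cv::saturate_cast<uchar>`.

```cpp
// Scalar model of the row-merging step that the PR factors into
// inter_area::mul / inter_area::muladd / inter_area::saturate_store.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for cv::saturate_cast<uchar>: round, then clamp to [0, 255].
static uint8_t saturate_u8(float v) {
    return static_cast<uint8_t>(std::min(255.f, std::max(0.f, std::round(v))));
}

// sum[dx] = beta * buf[dx]            (inter_area::mul)
static void mul(const float* buf, int width, float beta, float* sum) {
    for (int dx = 0; dx < width; ++dx) sum[dx] = beta * buf[dx];
}

// sum[dx] += beta * buf[dx]           (inter_area::muladd)
static void muladd(const float* buf, int width, float beta, float* sum) {
    for (int dx = 0; dx < width; ++dx) sum[dx] += beta * buf[dx];
}

// D[dx] = saturate_cast<T>(sum[dx])   (inter_area::saturate_store)
static void saturate_store(const float* sum, int width, uint8_t* D) {
    for (int dx = 0; dx < width; ++dx) D[dx] = saturate_u8(sum[dx]);
}

int main() {
    const int width = 8;
    std::vector<float> row0(width, 100.f), row1(width, 300.f), sum(width);
    std::vector<uint8_t> dst(width);

    mul(row0.data(), width, 0.5f, sum.data());      // first source row of this output row
    muladd(row1.data(), width, 0.5f, sum.data());   // remaining source rows accumulate
    saturate_store(sum.data(), width, dst.data());  // flush: 0.5*100 + 0.5*300 = 200

    std::printf("dst[0] = %d\n", dst[0]);           // prints 200
    return 0;
}
```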
@@ -3019,6 +3019,111 @@ struct DecimateAlpha
 };
 
+
+namespace inter_area {
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+inline void saturate_store(const float* src, uchar* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3)));
+}
+
+inline void saturate_store(const float* src, ushort* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack_u(tmp0, tmp1));
+}
+
+inline void saturate_store(const float* src, short* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(tmp0, tmp1));
+}
+
+static inline v_float32 vx_setall(float coeff) { return vx_setall_f32(coeff); }
+
+template <typename T>
+struct VArea {};
+
+template <>
+struct VArea<float> {
+    typedef v_float32 vWT;
+};
+#endif
+
+#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F)
+static inline v_float64 vx_setall(double coeff) { return vx_setall_f64(coeff); }
+
+template <>
+struct VArea<double> {
+    typedef v_float64 vWT;
+};
+
+#else
+inline void mul(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+inline void muladd(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+#endif
+
+template <typename T, typename WT>
+inline void saturate_store(const WT* sum, int width, T* D) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes() * sizeof(WT) / sizeof(T);
+    for (; dx + step < width; dx += step) {
+        saturate_store(sum + dx, D + dx);
+    }
+#endif
+    for (; dx < width; ++dx) {
+        D[dx] = saturate_cast<T>(sum[dx]);
+    }
+}
+
+// Optimization when T == WT.
+template <typename WT>
+inline void saturate_store(const WT* sum, int width, WT* D) {
+    std::copy(sum, sum + width, D);
+}
+
+template <typename WT>
+inline void mul(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_mul(vx_setall(beta), vx_load(buf + dx)));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+template <typename WT>
+inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vx_setall(beta), vx_load(buf + dx))));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+
+} // namespace inter_area
 
 template<typename T, typename WT> class ResizeArea_Invoker :
     public ParallelLoopBody
 {
@@ -3120,27 +3225,17 @@ public:
 
             if( dy != prev_dy )
             {
-                T* D = dst->template ptr<T>(prev_dy);
-
-                for( dx = 0; dx < dsize.width; dx++ )
-                {
-                    D[dx] = saturate_cast<T>(sum[dx]);
-                    sum[dx] = beta*buf[dx];
-                }
+                inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
+                inter_area::mul(buf, dsize.width, beta, sum);
                 prev_dy = dy;
             }
            else
            {
-                for( dx = 0; dx < dsize.width; dx++ )
-                    sum[dx] += beta*buf[dx];
+                inter_area::muladd(buf, dsize.width, beta, sum);
            }
        }
 
-        {
-            T* D = dst->template ptr<T>(prev_dy);
-            for( dx = 0; dx < dsize.width; dx++ )
-                D[dx] = saturate_cast<T>(sum[dx]);
-        }
+        inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
     }
 
 private:
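The PR claims a 10 to 20% speed-up; the authoritative numbers come from the perf test updated in https://github.com/opencv/opencv/pull/24417. For a quick local sanity check, a rough micro-benchmark like the one below can be run before and after the patch. This is an illustrative sketch using only public OpenCV API; the image size, scale factor and iteration count are arbitrary choices.

```cpp
// Rough timing of INTER_AREA downscaling; run before and after the patch and compare.
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

#include <cstdint>
#include <iostream>

int main() {
    cv::Mat src(2160, 3840, CV_8UC3);
    cv::randu(src, 0, 256);              // random content is enough for timing

    cv::Mat dst;
    const int iters = 200;

    // Warm-up run so the first allocation is not measured.
    cv::resize(src, dst, cv::Size(), 0.3, 0.3, cv::INTER_AREA);

    const int64_t t0 = cv::getTickCount();
    for (int i = 0; i < iters; ++i)
        cv::resize(src, dst, cv::Size(), 0.3, 0.3, cv::INTER_AREA);
    const double ms = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / iters;

    std::cout << "INTER_AREA " << src.cols << "x" << src.rows << " -> "
              << dst.cols << "x" << dst.rows << ": " << ms << " ms/iter" << std::endl;
    return 0;
}
```

Note that cv::resize dispatches to a specialized fast kernel when the inverse scale factors are integers, so a non-integer scale such as 0.3 should exercise the general area-averaging path changed by this PR.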