mirror of
https://github.com/opencv/opencv.git
synced 2025-07-20 19:17:36 +08:00
Merge pull request #26958 from amane-ame:pyramids_hal_rvv
Add RISC-V HAL implementation for cv::pyrDown and cv::pyrUp #26958 This patch implements `cv_hal_pyrdown/cv_hal_pyrup` function in RVV_HAL using native intrinsics, optimizing the performance for `cv::pyrDown`, `cv::pyrUp` and `cv::buildPyramids` with data types `{8U,16S,32F} x {C1,C2,C3,C4,Cn}`. Tested on MUSE-PI (Spacemit X60) for both gcc 14.2 and clang 20.0. ``` $ ./opencv_test_imgproc --gtest_filter="*pyr*:*Pyr*" $ ./opencv_perf_imgproc --gtest_filter="*pyr*:*Pyr*" --perf_min_samples=300 --perf_force_samples=300 ``` <img width="1112" alt="Untitled" src="https://github.com/user-attachments/assets/235a9fba-0d29-434e-8a10-498212bac657" /> ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
5c6c6af4ec
commit
cbcfd772ce
2
3rdparty/hal_rvv/hal_rvv.hpp
vendored
2
3rdparty/hal_rvv/hal_rvv.hpp
vendored
@ -30,6 +30,8 @@
|
|||||||
#include "hal_rvv_1p0/atan.hpp" // core
|
#include "hal_rvv_1p0/atan.hpp" // core
|
||||||
#include "hal_rvv_1p0/split.hpp" // core
|
#include "hal_rvv_1p0/split.hpp" // core
|
||||||
#include "hal_rvv_1p0/flip.hpp" // core
|
#include "hal_rvv_1p0/flip.hpp" // core
|
||||||
|
|
||||||
|
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
719
3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp
vendored
Normal file
719
3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp
vendored
Normal file
@ -0,0 +1,719 @@
|
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
#ifndef OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED
|
||||||
|
#define OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED
|
||||||
|
|
||||||
|
#include <riscv_vector.h>
|
||||||
|
|
||||||
|
namespace cv { namespace cv_hal_rvv { namespace pyramids {
|
||||||
|
|
||||||
|
#undef cv_hal_pyrdown
|
||||||
|
#define cv_hal_pyrdown cv::cv_hal_rvv::pyramids::pyrDown
|
||||||
|
#undef cv_hal_pyrup
|
||||||
|
#define cv_hal_pyrup cv::cv_hal_rvv::pyramids::pyrUp
|
||||||
|
|
||||||
|
template<typename T> struct rvv;
|
||||||
|
|
||||||
|
template<> struct rvv<uchar>
|
||||||
|
{
|
||||||
|
static inline size_t vsetvl_WT(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||||
|
static inline vuint8m1_t vle_T(const uchar* a, size_t b) { return __riscv_vle8_v_u8m1(a, b); }
|
||||||
|
static inline vint32m4_t vle_WT(const int* a, size_t b) { return __riscv_vle32_v_i32m4(a, b); }
|
||||||
|
static inline vuint32m4_t vle_M(const uint* a, size_t b) { return __riscv_vle32_v_u32m4(a, b); }
|
||||||
|
static inline vuint8m1_t vlse_T(const uchar* a, ptrdiff_t b, size_t c) { return __riscv_vlse8_v_u8m1(a, b, c); }
|
||||||
|
static inline vuint8m1_t vloxei_T(const uchar* a, vuint32m4_t b, size_t c) { return __riscv_vloxei32_v_u8m1(a, b, c); }
|
||||||
|
static inline void vse_T(uchar* a, vuint8m1_t b, size_t c) { return __riscv_vse8(a, b, c); }
|
||||||
|
static inline vint32m4_t vcvt_T_WT(vuint8m1_t a, size_t b) { return __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vzext_vf4(a, b)); }
|
||||||
|
static inline vuint8m1_t vcvt_WT_T(vint32m4_t a, int b, size_t c) { return __riscv_vncvt_x(__riscv_vncvt_x(__riscv_vreinterpret_v_i32m4_u32m4(__riscv_vsra(__riscv_vadd(a, 1 << (b - 1), c), b, c)), c), c); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct rvv<short>
|
||||||
|
{
|
||||||
|
static inline size_t vsetvl_WT(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||||
|
static inline vint16m2_t vle_T(const short* a, size_t b) { return __riscv_vle16_v_i16m2(a, b); }
|
||||||
|
static inline vint32m4_t vle_WT(const int* a, size_t b) { return __riscv_vle32_v_i32m4(a, b); }
|
||||||
|
static inline vuint32m4_t vle_M(const uint* a, size_t b) { return __riscv_vle32_v_u32m4(a, b); }
|
||||||
|
static inline vint16m2_t vlse_T(const short* a, ptrdiff_t b, size_t c) { return __riscv_vlse16_v_i16m2(a, b, c); }
|
||||||
|
static inline vint16m2_t vloxei_T(const short* a, vuint32m4_t b, size_t c) { return __riscv_vloxei32_v_i16m2(a, b, c); }
|
||||||
|
static inline void vse_T(short* a, vint16m2_t b, size_t c) { return __riscv_vse16(a, b, c); }
|
||||||
|
static inline vint32m4_t vcvt_T_WT(vint16m2_t a, size_t b) { return __riscv_vsext_vf2(a, b); }
|
||||||
|
static inline vint16m2_t vcvt_WT_T(vint32m4_t a, int b, size_t c) { return __riscv_vncvt_x(__riscv_vsra(__riscv_vadd(a, 1 << (b - 1), c), b, c), c); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct rvv<float>
|
||||||
|
{
|
||||||
|
static inline size_t vsetvl_WT(size_t a) { return __riscv_vsetvl_e32m4(a); }
|
||||||
|
static inline vfloat32m4_t vle_T(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||||
|
static inline vfloat32m4_t vle_WT(const float* a, size_t b) { return __riscv_vle32_v_f32m4(a, b); }
|
||||||
|
static inline vuint32m4_t vle_M(const uint* a, size_t b) { return __riscv_vle32_v_u32m4(a, b); }
|
||||||
|
static inline vfloat32m4_t vlse_T(const float* a, ptrdiff_t b, size_t c) { return __riscv_vlse32_v_f32m4(a, b, c); }
|
||||||
|
static inline vfloat32m4_t vloxei_T(const float* a, vuint32m4_t b, size_t c) { return __riscv_vloxei32_v_f32m4(a, b, c); }
|
||||||
|
static inline void vse_T(float* a, vfloat32m4_t b, size_t c) { return __riscv_vse32(a, b, c); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename WT> struct pyrDownVec0
|
||||||
|
{
|
||||||
|
void operator()(const T* src, WT* row, const uint* tabM, int start, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
switch (start)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + x * 2 - 2, 2 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + x * 2 - 1, 2 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + x * 2, 2 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + x * 2 + 1, 2 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + x * 2 + 2, 2 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vse32(row + x, __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
for( int x = start / 2; x < end / 2; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end / 2 - x);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 2, 4 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 2, 4 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 2, 4 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 2, 4 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 2, 4 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 2, 2 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 2 + 1, 4 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 2 + 1, 4 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 2 + 1, 4 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 2 + 1, 4 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 2 + 1, 4 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 2 + 1, 2 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
for( int x = start / 3; x < end / 3; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end / 3 - x);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 3, 6 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 3, 6 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 3, 6 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 3, 6 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 3, 6 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 3, 3 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 3 + 1, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 3 + 1, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 3 + 1, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 3 + 1, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 3 + 1, 6 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 3 + 1, 3 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 3 + 2, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 3 + 2, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 3 + 2, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 3 + 2, 6 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 3 + 2, 6 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 3 + 2, 3 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
for( int x = start / 4; x < end / 4; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end / 4 - x);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 4, 8 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 4, 8 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 4, 8 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 4, 8 * sizeof(T), vl), vl);
|
||||||
|
auto vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 4, 8 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 4, 4 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 4 + 1, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 4 + 1, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 4 + 1, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 4 + 1, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 4 + 1, 8 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 1, 4 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 4 + 2, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 4 + 2, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 4 + 2, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 4 + 2, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 4 + 2, 8 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 2, 4 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 2) * 4 + 3, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 - 1) * 4 + 3, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2) * 4 + 3, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 1) * 4 + 3, 8 * sizeof(T), vl), vl);
|
||||||
|
vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vlse_T(src + (x * 2 + 2) * 4 + 3, 8 * sizeof(T), vl), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 3, 4 * sizeof(WT), __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_tabM = rvv<T>::vle_M(tabM + x, vl);
|
||||||
|
vec_tabM = __riscv_vmul(__riscv_vsub(vec_tabM, start * 2, vl), sizeof(T), vl);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vloxei_T(src, vec_tabM, vl), vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(T), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vloxei_T(src, vec_tabM, vl), vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(T), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vloxei_T(src, vec_tabM, vl), vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(T), vl);
|
||||||
|
auto vec_src3 = rvv<T>::vcvt_T_WT(rvv<T>::vloxei_T(src, vec_tabM, vl), vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(T), vl);
|
||||||
|
auto vec_src4 = rvv<T>::vcvt_T_WT(rvv<T>::vloxei_T(src, vec_tabM, vl), vl);
|
||||||
|
__riscv_vse32(row + x, __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<> struct pyrDownVec0<float, float>
|
||||||
|
{
|
||||||
|
void operator()(const float* src, float* row, const uint* tabM, int start, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
switch (start)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<float>::vlse_T(src + x * 2 - 2, 2 * sizeof(float), vl);
|
||||||
|
auto vec_src1 = rvv<float>::vlse_T(src + x * 2 - 1, 2 * sizeof(float), vl);
|
||||||
|
auto vec_src2 = rvv<float>::vlse_T(src + x * 2, 2 * sizeof(float), vl);
|
||||||
|
auto vec_src3 = rvv<float>::vlse_T(src + x * 2 + 1, 2 * sizeof(float), vl);
|
||||||
|
auto vec_src4 = rvv<float>::vlse_T(src + x * 2 + 2, 2 * sizeof(float), vl);
|
||||||
|
__riscv_vse32(row + x, __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
for( int x = start / 2; x < end / 2; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end / 2 - x);
|
||||||
|
auto vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 2, 4 * sizeof(float), vl);
|
||||||
|
auto vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 2, 4 * sizeof(float), vl);
|
||||||
|
auto vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 2, 4 * sizeof(float), vl);
|
||||||
|
auto vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 2, 4 * sizeof(float), vl);
|
||||||
|
auto vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 2, 4 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 2, 2 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 2 + 1, 4 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 2 + 1, 4 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 2 + 1, 4 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 2 + 1, 4 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 2 + 1, 4 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 2 + 1, 2 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
for( int x = start / 3; x < end / 3; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end / 3 - x);
|
||||||
|
auto vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 3, 6 * sizeof(float), vl);
|
||||||
|
auto vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 3, 6 * sizeof(float), vl);
|
||||||
|
auto vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 3, 6 * sizeof(float), vl);
|
||||||
|
auto vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 3, 6 * sizeof(float), vl);
|
||||||
|
auto vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 3, 6 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 3, 3 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 3 + 1, 6 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 3 + 1, 6 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 3 + 1, 6 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 3 + 1, 6 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 3 + 1, 6 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 3 + 1, 3 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 3 + 2, 6 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 3 + 2, 6 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 3 + 2, 6 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 3 + 2, 6 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 3 + 2, 6 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 3 + 2, 3 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
for( int x = start / 4; x < end / 4; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end / 4 - x);
|
||||||
|
auto vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 4, 8 * sizeof(float), vl);
|
||||||
|
auto vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 4, 8 * sizeof(float), vl);
|
||||||
|
auto vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 4, 8 * sizeof(float), vl);
|
||||||
|
auto vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 4, 8 * sizeof(float), vl);
|
||||||
|
auto vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 4, 8 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 4, 4 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 4 + 1, 8 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 4 + 1, 8 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 4 + 1, 8 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 4 + 1, 8 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 4 + 1, 8 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 1, 4 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 4 + 2, 8 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 4 + 2, 8 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 4 + 2, 8 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 4 + 2, 8 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 4 + 2, 8 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 2, 4 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
vec_src0 = rvv<float>::vlse_T(src + (x * 2 - 2) * 4 + 3, 8 * sizeof(float), vl);
|
||||||
|
vec_src1 = rvv<float>::vlse_T(src + (x * 2 - 1) * 4 + 3, 8 * sizeof(float), vl);
|
||||||
|
vec_src2 = rvv<float>::vlse_T(src + (x * 2) * 4 + 3, 8 * sizeof(float), vl);
|
||||||
|
vec_src3 = rvv<float>::vlse_T(src + (x * 2 + 1) * 4 + 3, 8 * sizeof(float), vl);
|
||||||
|
vec_src4 = rvv<float>::vlse_T(src + (x * 2 + 2) * 4 + 3, 8 * sizeof(float), vl);
|
||||||
|
__riscv_vsse32(row + x * 4 + 3, 4 * sizeof(float), __riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), vl);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_tabM = rvv<float>::vle_M(tabM + x, vl);
|
||||||
|
vec_tabM = __riscv_vmul(__riscv_vsub(vec_tabM, start * 2, vl), sizeof(float), vl);
|
||||||
|
auto vec_src0 = rvv<float>::vloxei_T(src, vec_tabM, vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(float), vl);
|
||||||
|
auto vec_src1 = rvv<float>::vloxei_T(src, vec_tabM, vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(float), vl);
|
||||||
|
auto vec_src2 = rvv<float>::vloxei_T(src, vec_tabM, vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(float), vl);
|
||||||
|
auto vec_src3 = rvv<float>::vloxei_T(src, vec_tabM, vl);
|
||||||
|
vec_tabM = __riscv_vadd(vec_tabM, start * sizeof(float), vl);
|
||||||
|
auto vec_src4 = rvv<float>::vloxei_T(src, vec_tabM, vl);
|
||||||
|
__riscv_vse32(row + x, __riscv_vfmadd(__riscv_vfadd(__riscv_vfadd(vec_src1, vec_src2, vl), vec_src3, vl), 4,
|
||||||
|
__riscv_vfadd(__riscv_vfadd(vec_src0, vec_src4, vl), __riscv_vfadd(vec_src2, vec_src2, vl), vl), vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename WT> struct pyrDownVec1
|
||||||
|
{
|
||||||
|
void operator()(WT* row0, WT* row1, WT* row2, WT* row3, WT* row4, T* dst, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<T>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<T>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<T>::vle_WT(row2 + x, vl);
|
||||||
|
auto vec_src3 = rvv<T>::vle_WT(row3 + x, vl);
|
||||||
|
auto vec_src4 = rvv<T>::vle_WT(row4 + x, vl);
|
||||||
|
rvv<T>::vse_T(dst + x, rvv<T>::vcvt_WT_T(__riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl),
|
||||||
|
__riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl), 8, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<> struct pyrDownVec1<float, float>
|
||||||
|
{
|
||||||
|
void operator()(float* row0, float* row1, float* row2, float* row3, float* row4, float* dst, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<float>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<float>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<float>::vle_WT(row2 + x, vl);
|
||||||
|
auto vec_src3 = rvv<float>::vle_WT(row3 + x, vl);
|
||||||
|
auto vec_src4 = rvv<float>::vle_WT(row4 + x, vl);
|
||||||
|
rvv<float>::vse_T(dst + x, __riscv_vfmul(__riscv_vfmadd(vec_src2, 6, __riscv_vfmadd(__riscv_vfadd(vec_src1, vec_src3, vl), 4, __riscv_vfadd(vec_src0, vec_src4, vl), vl), vl), 1.f / 256.f, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename WT> struct pyrUpVec0
|
||||||
|
{
|
||||||
|
void operator()(const T* src, WT* row, const uint* dtab, int start, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<T>::vcvt_T_WT(rvv<T>::vle_T(src + x - start, vl), vl);
|
||||||
|
auto vec_src1 = rvv<T>::vcvt_T_WT(rvv<T>::vle_T(src + x, vl), vl);
|
||||||
|
auto vec_src2 = rvv<T>::vcvt_T_WT(rvv<T>::vle_T(src + x + start, vl), vl);
|
||||||
|
|
||||||
|
auto vec_dtab = rvv<T>::vle_M(dtab + x, vl);
|
||||||
|
vec_dtab = __riscv_vmul(vec_dtab, sizeof(WT), vl);
|
||||||
|
__riscv_vsoxei32(row, vec_dtab, __riscv_vadd(__riscv_vadd(vec_src0, vec_src2, vl), __riscv_vadd(__riscv_vsll(vec_src1, 2, vl), __riscv_vsll(vec_src1, 1, vl), vl), vl), vl);
|
||||||
|
__riscv_vsoxei32(row, __riscv_vadd(vec_dtab, start * sizeof(WT), vl), __riscv_vsll(__riscv_vadd(vec_src1, vec_src2, vl), 2, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<> struct pyrUpVec0<float, float>
|
||||||
|
{
|
||||||
|
void operator()(const float* src, float* row, const uint* dtab, int start, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
for( int x = start; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<float>::vle_T(src + x - start, vl);
|
||||||
|
auto vec_src1 = rvv<float>::vle_T(src + x, vl);
|
||||||
|
auto vec_src2 = rvv<float>::vle_T(src + x + start, vl);
|
||||||
|
|
||||||
|
auto vec_dtab = rvv<float>::vle_M(dtab + x, vl);
|
||||||
|
vec_dtab = __riscv_vmul(vec_dtab, sizeof(float), vl);
|
||||||
|
__riscv_vsoxei32(row, vec_dtab, __riscv_vfadd(__riscv_vfmadd(vec_src1, 6, vec_src0, vl), vec_src2, vl), vl);
|
||||||
|
__riscv_vsoxei32(row, __riscv_vadd(vec_dtab, start * sizeof(float), vl), __riscv_vfmul(__riscv_vfadd(vec_src1, vec_src2, vl), 4, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename WT> struct pyrUpVec1
|
||||||
|
{
|
||||||
|
void operator()(WT* row0, WT* row1, WT* row2, T* dst0, T* dst1, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
if (dst0 != dst1)
|
||||||
|
{
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<T>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<T>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<T>::vle_WT(row2 + x, vl);
|
||||||
|
rvv<T>::vse_T(dst0 + x, rvv<T>::vcvt_WT_T(__riscv_vadd(__riscv_vadd(vec_src0, vec_src2, vl), __riscv_vadd(__riscv_vsll(vec_src1, 2, vl), __riscv_vsll(vec_src1, 1, vl), vl), vl), 6, vl), vl);
|
||||||
|
rvv<T>::vse_T(dst1 + x, rvv<T>::vcvt_WT_T(__riscv_vsll(__riscv_vadd(vec_src1, vec_src2, vl), 2, vl), 6, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<T>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<T>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<T>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<T>::vle_WT(row2 + x, vl);
|
||||||
|
rvv<T>::vse_T(dst0 + x, rvv<T>::vcvt_WT_T(__riscv_vadd(__riscv_vadd(vec_src0, vec_src2, vl), __riscv_vadd(__riscv_vsll(vec_src1, 2, vl), __riscv_vsll(vec_src1, 1, vl), vl), vl), 6, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<> struct pyrUpVec1<float, float>
|
||||||
|
{
|
||||||
|
void operator()(float* row0, float* row1, float* row2, float* dst0, float* dst1, int end)
|
||||||
|
{
|
||||||
|
int vl;
|
||||||
|
if (dst0 != dst1)
|
||||||
|
{
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<float>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<float>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<float>::vle_WT(row2 + x, vl);
|
||||||
|
rvv<float>::vse_T(dst0 + x, __riscv_vfmul(__riscv_vfadd(__riscv_vfmadd(vec_src1, 6, vec_src0, vl), vec_src2, vl), 1.f / 64.f, vl), vl);
|
||||||
|
rvv<float>::vse_T(dst1 + x, __riscv_vfmul(__riscv_vfadd(vec_src1, vec_src2, vl), 1.f / 16.f, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for( int x = 0 ; x < end; x += vl )
|
||||||
|
{
|
||||||
|
vl = rvv<float>::vsetvl_WT(end - x);
|
||||||
|
auto vec_src0 = rvv<float>::vle_WT(row0 + x, vl);
|
||||||
|
auto vec_src1 = rvv<float>::vle_WT(row1 + x, vl);
|
||||||
|
auto vec_src2 = rvv<float>::vle_WT(row2 + x, vl);
|
||||||
|
rvv<float>::vse_T(dst0 + x, __riscv_vfmul(__riscv_vfadd(__riscv_vfmadd(vec_src1, 6, vec_src0, vl), vec_src2, vl), 1.f / 64.f, vl), vl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename WT>
|
||||||
|
struct PyrDownInvoker : ParallelLoopBody
|
||||||
|
{
|
||||||
|
PyrDownInvoker(const uchar* _src_data, size_t _src_step, int _src_width, int _src_height, uchar* _dst_data, size_t _dst_step, int _dst_width, int _dst_height, int _cn, int _borderType, int* _tabR, int* _tabM, int* _tabL)
|
||||||
|
{
|
||||||
|
src_data = _src_data;
|
||||||
|
src_step = _src_step;
|
||||||
|
src_width = _src_width;
|
||||||
|
src_height = _src_height;
|
||||||
|
dst_data = _dst_data;
|
||||||
|
dst_step = _dst_step;
|
||||||
|
dst_width = _dst_width;
|
||||||
|
dst_height = _dst_height;
|
||||||
|
cn = _cn;
|
||||||
|
borderType = _borderType;
|
||||||
|
tabR = _tabR;
|
||||||
|
tabM = _tabM;
|
||||||
|
tabL = _tabL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator()(const Range& range) const CV_OVERRIDE;
|
||||||
|
|
||||||
|
const uchar* src_data;
|
||||||
|
size_t src_step;
|
||||||
|
int src_width;
|
||||||
|
int src_height;
|
||||||
|
uchar* dst_data;
|
||||||
|
size_t dst_step;
|
||||||
|
int dst_width;
|
||||||
|
int dst_height;
|
||||||
|
int cn;
|
||||||
|
int borderType;
|
||||||
|
int* tabR;
|
||||||
|
int* tabM;
|
||||||
|
int* tabL;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline int borderInterpolate( int p, int len, int borderType )
|
||||||
|
{
|
||||||
|
if( (unsigned)p < (unsigned)len )
|
||||||
|
;
|
||||||
|
else if( borderType == BORDER_REPLICATE )
|
||||||
|
p = p < 0 ? 0 : len - 1;
|
||||||
|
else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
|
||||||
|
{
|
||||||
|
int delta = borderType == BORDER_REFLECT_101;
|
||||||
|
if( len == 1 )
|
||||||
|
return 0;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if( p < 0 )
|
||||||
|
p = -p - 1 + delta;
|
||||||
|
else
|
||||||
|
p = len - 1 - (p - len) - delta;
|
||||||
|
}
|
||||||
|
while( (unsigned)p >= (unsigned)len );
|
||||||
|
}
|
||||||
|
else if( borderType == BORDER_WRAP )
|
||||||
|
{
|
||||||
|
if( p < 0 )
|
||||||
|
p -= ((p-len+1)/len)*len;
|
||||||
|
if( p >= len )
|
||||||
|
p %= len;
|
||||||
|
}
|
||||||
|
else if( borderType == BORDER_CONSTANT )
|
||||||
|
p = -1;
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the algorithm is copied from imgproc/src/pyramids.cpp,
|
||||||
|
// in the function template void cv::pyrDown_
|
||||||
|
template<typename T, typename WT>
|
||||||
|
inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int cn, int borderType)
|
||||||
|
{
|
||||||
|
const int PD_SZ = 5;
|
||||||
|
|
||||||
|
std::vector<int> _tabM(dst_width * cn), _tabL(cn * (PD_SZ + 2)), _tabR(cn * (PD_SZ + 2));
|
||||||
|
int *tabM = _tabM.data(), *tabL = _tabL.data(), *tabR = _tabR.data();
|
||||||
|
|
||||||
|
if( src_width <= 0 || src_height <= 0 ||
|
||||||
|
std::abs(dst_width*2 - src_width) > 2 ||
|
||||||
|
std::abs(dst_height*2 - src_height) > 2 )
|
||||||
|
{
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
}
|
||||||
|
int width0 = std::min((src_width-PD_SZ/2-1)/2 + 1, dst_width);
|
||||||
|
|
||||||
|
for (int x = 0; x <= PD_SZ+1; x++)
|
||||||
|
{
|
||||||
|
int sx0 = borderInterpolate(x - PD_SZ/2, src_width, borderType)*cn;
|
||||||
|
int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, src_width, borderType)*cn;
|
||||||
|
for (int k = 0; k < cn; k++)
|
||||||
|
{
|
||||||
|
tabL[x*cn + k] = sx0 + k;
|
||||||
|
tabR[x*cn + k] = sx1 + k;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int x = 0; x < dst_width*cn; x++)
|
||||||
|
tabM[x] = (x/cn)*2*cn + x % cn;
|
||||||
|
|
||||||
|
cv::parallel_for_(Range(0,dst_height), PyrDownInvoker<T, WT>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn, borderType, tabR, tabM, tabL), cv::getNumThreads());
|
||||||
|
return CV_HAL_ERROR_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T, typename WT>
|
||||||
|
void PyrDownInvoker<T, WT>::operator()(const Range& range) const
|
||||||
|
{
|
||||||
|
const int PD_SZ = 5;
|
||||||
|
|
||||||
|
int bufstep = (dst_width*cn + 15) & -16;
|
||||||
|
std::vector<WT> _buf(bufstep*PD_SZ + 16);
|
||||||
|
WT* buf = (WT*)(((size_t)_buf.data() + 15) & -16);
|
||||||
|
WT* rows[PD_SZ];
|
||||||
|
|
||||||
|
int sy0 = -PD_SZ/2, sy = range.start * 2 + sy0, width0 = std::min((src_width-PD_SZ/2-1)/2 + 1, dst_width);
|
||||||
|
|
||||||
|
int _dst_width = dst_width * cn;
|
||||||
|
width0 *= cn;
|
||||||
|
|
||||||
|
for (int y = range.start; y < range.end; y++)
|
||||||
|
{
|
||||||
|
T* dst = reinterpret_cast<T*>(dst_data + dst_step * y);
|
||||||
|
WT *row0, *row1, *row2, *row3, *row4;
|
||||||
|
|
||||||
|
// fill the ring buffer (horizontal convolution and decimation)
|
||||||
|
int sy_limit = y*2 + 2;
|
||||||
|
for( ; sy <= sy_limit; sy++ )
|
||||||
|
{
|
||||||
|
WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
|
||||||
|
int _sy = borderInterpolate(sy, src_height, borderType);
|
||||||
|
const T* src = reinterpret_cast<const T*>(src_data + src_step * _sy);
|
||||||
|
|
||||||
|
do {
|
||||||
|
int x = 0;
|
||||||
|
for( ; x < cn; x++ )
|
||||||
|
{
|
||||||
|
row[x] = src[tabL[x+cn*2]]*6 + (src[tabL[x+cn]] + src[tabL[x+cn*3]])*4 +
|
||||||
|
src[tabL[x]] + src[tabL[x+cn*4]];
|
||||||
|
}
|
||||||
|
|
||||||
|
if( x == _dst_width )
|
||||||
|
break;
|
||||||
|
|
||||||
|
pyrDownVec0<T, WT>()(src, row, reinterpret_cast<const uint*>(tabM), cn, width0);
|
||||||
|
x = width0;
|
||||||
|
|
||||||
|
// tabR
|
||||||
|
for (int x_ = 0; x < _dst_width; x++, x_++)
|
||||||
|
{
|
||||||
|
row[x] = src[tabR[x_+cn*2]]*6 + (src[tabR[x_+cn]] + src[tabR[x_+cn*3]])*4 +
|
||||||
|
src[tabR[x_]] + src[tabR[x_+cn*4]];
|
||||||
|
}
|
||||||
|
} while (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// do vertical convolution and decimation and write the result to the destination image
|
||||||
|
for (int k = 0; k < PD_SZ; k++)
|
||||||
|
rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
|
||||||
|
row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];
|
||||||
|
|
||||||
|
pyrDownVec1<T, WT>()(row0, row1, row2, row3, row4, dst, _dst_width);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// the algorithm is copied from imgproc/src/pyramids.cpp,
|
||||||
|
// in the function template void cv::pyrUp_
|
||||||
|
template<typename T, typename WT>
|
||||||
|
inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int cn)
|
||||||
|
{
|
||||||
|
const int PU_SZ = 3;
|
||||||
|
|
||||||
|
int bufstep = ((dst_width+1)*cn + 15) & -16;
|
||||||
|
std::vector<WT> _buf(bufstep*PU_SZ + 16);
|
||||||
|
WT* buf = (WT*)(((size_t)_buf.data() + 15) & -16);
|
||||||
|
std::vector<int> _dtab(src_width*cn);
|
||||||
|
int* dtab = _dtab.data();
|
||||||
|
WT* rows[PU_SZ];
|
||||||
|
|
||||||
|
if( std::abs(dst_width - src_width*2) != dst_width % 2 ||
|
||||||
|
std::abs(dst_height - src_height*2) != dst_height % 2)
|
||||||
|
{
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
}
|
||||||
|
int k, x, sy0 = -PU_SZ/2, sy = sy0;
|
||||||
|
|
||||||
|
src_width *= cn;
|
||||||
|
dst_width *= cn;
|
||||||
|
|
||||||
|
for( x = 0; x < src_width; x++ )
|
||||||
|
dtab[x] = (x/cn)*2*cn + x % cn;
|
||||||
|
|
||||||
|
for( int y = 0; y < src_height; y++ )
|
||||||
|
{
|
||||||
|
T* dst0 = reinterpret_cast<T*>(dst_data + dst_step * (y*2));
|
||||||
|
T* dst1 = reinterpret_cast<T*>(dst_data + dst_step * (std::min(y*2+1, dst_height-1)));
|
||||||
|
WT *row0, *row1, *row2;
|
||||||
|
|
||||||
|
// fill the ring buffer (horizontal convolution and decimation)
|
||||||
|
for( ; sy <= y + 1; sy++ )
|
||||||
|
{
|
||||||
|
WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
|
||||||
|
int _sy = borderInterpolate(sy*2, src_height*2, (int)BORDER_REFLECT_101)/2;
|
||||||
|
const T* src = reinterpret_cast<const T*>(src_data + src_step * _sy);
|
||||||
|
|
||||||
|
if( src_width == cn )
|
||||||
|
{
|
||||||
|
for( x = 0; x < cn; x++ )
|
||||||
|
row[x] = row[x + cn] = src[x]*8;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for( x = 0; x < cn; x++ )
|
||||||
|
{
|
||||||
|
int dx = dtab[x];
|
||||||
|
WT t0 = src[x]*6 + src[x + cn]*2;
|
||||||
|
WT t1 = (src[x] + src[x + cn])*4;
|
||||||
|
row[dx] = t0; row[dx + cn] = t1;
|
||||||
|
dx = dtab[src_width - cn + x];
|
||||||
|
int sx = src_width - cn + x;
|
||||||
|
t0 = src[sx - cn] + src[sx]*7;
|
||||||
|
t1 = src[sx]*8;
|
||||||
|
row[dx] = t0; row[dx + cn] = t1;
|
||||||
|
|
||||||
|
if (dst_width > src_width*2)
|
||||||
|
{
|
||||||
|
row[(dst_width-1) * cn + x] = row[dx + cn];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pyrUpVec0<T, WT>()(src, row, reinterpret_cast<const uint*>(dtab), cn, src_width - cn);
|
||||||
|
}
|
||||||
|
|
||||||
|
// do vertical convolution and decimation and write the result to the destination image
|
||||||
|
for( k = 0; k < PU_SZ; k++ )
|
||||||
|
rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
|
||||||
|
row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
|
||||||
|
|
||||||
|
pyrUpVec1<T, WT>()(row0, row1, row2, dst0, dst1, dst_width);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dst_height > src_height*2)
|
||||||
|
{
|
||||||
|
T* dst0 = reinterpret_cast<T*>(dst_data + dst_step * (src_height*2-2));
|
||||||
|
T* dst2 = reinterpret_cast<T*>(dst_data + dst_step * (src_height*2));
|
||||||
|
|
||||||
|
for(x = 0; x < dst_width ; x++ )
|
||||||
|
{
|
||||||
|
dst2[x] = dst0[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return CV_HAL_ERROR_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
|
||||||
|
{
|
||||||
|
if (border_type == BORDER_CONSTANT || (depth == CV_32F && cn == 1))
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
|
||||||
|
switch (depth)
|
||||||
|
{
|
||||||
|
case CV_8U:
|
||||||
|
return pyrDown<uchar, int>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn, border_type);
|
||||||
|
case CV_16S:
|
||||||
|
return pyrDown<short, int>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn, border_type);
|
||||||
|
case CV_32F:
|
||||||
|
return pyrDown<float, float>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn, border_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
|
||||||
|
{
|
||||||
|
if (border_type != BORDER_DEFAULT)
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
|
||||||
|
switch (depth)
|
||||||
|
{
|
||||||
|
case CV_8U:
|
||||||
|
return pyrUp<uchar, int>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn);
|
||||||
|
case CV_16S:
|
||||||
|
return pyrUp<short, int>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn);
|
||||||
|
case CV_32F:
|
||||||
|
return pyrUp<float, float>(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, cn);
|
||||||
|
}
|
||||||
|
|
||||||
|
return CV_HAL_ERROR_NOT_IMPLEMENTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
}}}
|
||||||
|
|
||||||
|
#endif
|
@ -1274,6 +1274,26 @@ inline int hal_ni_pyrdown_offset(const uchar* src_data, size_t src_step, int src
|
|||||||
#define cv_hal_pyrdown_offset hal_ni_pyrdown_offset
|
#define cv_hal_pyrdown_offset hal_ni_pyrdown_offset
|
||||||
//! @endcond
|
//! @endcond
|
||||||
|
|
||||||
|
/**
|
||||||
|
@brief Perform Gaussian Blur and upsampling for input tile.
|
||||||
|
@param depth Depths of source and destination image
|
||||||
|
@param src_data Source image data
|
||||||
|
@param src_step Source image step
|
||||||
|
@param dst_data Destination image data
|
||||||
|
@param dst_step Destination image step
|
||||||
|
@param src_width Source image width
|
||||||
|
@param src_height Source image height
|
||||||
|
@param dst_width Destination image width
|
||||||
|
@param dst_height Destination image height
|
||||||
|
@param cn Number of channels
|
||||||
|
@param border_type Border type
|
||||||
|
*/
|
||||||
|
inline int hal_ni_pyrup(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||||
|
|
||||||
|
//! @cond IGNORED
|
||||||
|
#define cv_hal_pyrup hal_ni_pyrup
|
||||||
|
//! @endcond
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@brief Canny edge detector
|
@brief Canny edge detector
|
||||||
@param src_data Source image data
|
@param src_data Source image data
|
||||||
|
@ -1388,6 +1388,8 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT
|
|||||||
Mat dst = _dst.getMat();
|
Mat dst = _dst.getMat();
|
||||||
int depth = src.depth();
|
int depth = src.depth();
|
||||||
|
|
||||||
|
CALL_HAL(pyrUp, cv_hal_pyrup, src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, depth, src.channels(), borderType);
|
||||||
|
|
||||||
#ifdef HAVE_IPP
|
#ifdef HAVE_IPP
|
||||||
bool isolated = (borderType & BORDER_ISOLATED) != 0;
|
bool isolated = (borderType & BORDER_ISOLATED) != 0;
|
||||||
int borderTypeNI = borderType & ~BORDER_ISOLATED;
|
int borderTypeNI = borderType & ~BORDER_ISOLATED;
|
||||||
|
Loading…
Reference in New Issue
Block a user