mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 14:36:36 +08:00
Merge pull request #26923 from dkurt:merge_rvv_opt
Further optimization of cv::merge RVV HAL for 8U and 16S #26923 ### Pull Request Readiness Checklist * Banana Pi BF3 (SpacemiT K1) RISC-V * Compiler: Syntacore Clang 18.1.4 (build 2024.12) ``` Geometric mean (ms) Name of Test baseline pr pr merge vs baseline merge (x-factor) merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 2) 0.013 0.003 3.76 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 3) 0.020 0.006 3.46 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 4) 0.026 0.010 2.61 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 5) 0.043 0.028 1.56 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 6) 0.054 0.035 1.53 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 7) 0.065 0.050 1.30 merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 8) 0.070 0.036 1.95 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 2) 0.015 0.008 1.82 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 3) 0.022 0.015 1.48 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 4) 0.029 0.018 1.63 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 5) 0.067 0.044 1.54 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 6) 0.088 0.056 1.58 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 7) 0.104 0.076 1.38 merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 8) 0.116 0.065 1.79 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 2) 0.421 0.176 2.39 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 3) 0.792 0.284 2.79 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 4) 1.090 0.370 2.95 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 5) 1.835 1.399 1.31 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 6) 2.389 1.776 1.35 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 7) 3.000 2.471 1.21 merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 8) 3.178 2.104 1.51 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 2) 0.490 0.377 1.30 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 3) 1.348 0.602 2.24 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 4) 1.827 0.813 2.25 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 5) 3.283 2.692 1.22 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 6) 4.922 3.334 1.48 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 7) 5.725 4.399 1.30 merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 8) 6.278 4.748 1.32 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 2) 1.267 0.603 2.10 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 3) 2.394 0.934 2.56 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 4) 3.236 1.434 2.26 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 5) 5.398 4.345 1.24 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 6) 7.127 5.459 1.31 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 7) 8.590 7.298 1.18 merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 8) 9.360 6.152 1.52 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 2) 1.482 1.242 1.19 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 3) 4.008 1.817 2.21 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 4) 6.079 2.468 2.46 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 5) 11.300 8.644 1.31 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 6) 15.125 12.126 1.25 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 7) 17.555 14.804 1.19 merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 8) 18.890 14.163 1.33 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 2) 2.910 1.326 2.19 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 3) 5.351 1.997 2.68 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 4) 7.290 2.629 2.77 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 5) 12.426 9.611 1.29 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 6) 16.453 12.162 1.35 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 7) 19.420 16.190 1.20 merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 8) 20.588 13.699 1.50 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 2) 3.400 2.640 1.29 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 3) 8.986 3.952 2.27 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 4) 11.972 5.273 2.27 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 5) 20.544 17.996 1.14 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 6) 28.677 22.086 1.30 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 7) 32.958 27.713 1.19 merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 8) 36.499 27.439 1.33 ``` See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
58b14294b5
commit
7a2b048c92
317
3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
vendored
317
3rdparty/hal_rvv/hal_rvv_1p0/merge.hpp
vendored
@ -17,236 +17,195 @@ namespace cv { namespace cv_hal_rvv {
|
||||
#undef cv_hal_merge64s
|
||||
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
#if defined __clang__ && __clang_major__ < 18
|
||||
#define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \
|
||||
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x2(seg, 0, v0); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x2(seg, 1, v1);
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \
|
||||
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 0, v0); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 1, v1); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 2, v2);
|
||||
|
||||
#define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \
|
||||
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 0, v0); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 1, v1); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 2, v2); \
|
||||
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 3, v3);
|
||||
|
||||
#define __riscv_vcreate_v_u8m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u8, 4, v0, v1)
|
||||
#define __riscv_vcreate_v_u8m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u8, 2, v0, v1, v2)
|
||||
#define __riscv_vcreate_v_u8m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u8, 2, v0, v1, v2, v3)
|
||||
#define __riscv_vcreate_v_u16m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u16, 4, v0, v1)
|
||||
#define __riscv_vcreate_v_u16m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u16, 2, v0, v1, v2)
|
||||
#define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3)
|
||||
#endif // clang < 18
|
||||
|
||||
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e8m1();
|
||||
if( k == 1 )
|
||||
int vl = 0;
|
||||
if (cn == 1)
|
||||
{
|
||||
const uchar* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
vl = __riscv_vsetvl_e8m8(len - i);
|
||||
__riscv_vse8_v_u8m8(dst + i, __riscv_vle8_v_u8m8(src0 + i, vl), vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
else if (cn == 2)
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
vl = __riscv_vsetvl_e8m4(len - i);
|
||||
vuint8m4x2_t seg = __riscv_vcreate_v_u8m4x2(
|
||||
__riscv_vle8_v_u8m4(src0 + i, vl),
|
||||
__riscv_vle8_v_u8m4(src1 + i, vl)
|
||||
);
|
||||
__riscv_vsseg2e8_v_u8m4x2(dst + i * cn, seg, vl);
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
else if (cn == 3)
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
vl = __riscv_vsetvl_e8m2(len - i);
|
||||
vuint8m2x3_t seg = __riscv_vcreate_v_u8m2x3(
|
||||
__riscv_vle8_v_u8m2(src0 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src1 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src2 + i, vl)
|
||||
);
|
||||
__riscv_vsseg3e8_v_u8m2x3(dst + i * cn, seg, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
}
|
||||
else if (cn == 4)
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
vl = __riscv_vsetvl_e8m2(len - i);
|
||||
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
|
||||
__riscv_vle8_v_u8m2(src0 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src1 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src2 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src3 + i, vl)
|
||||
);
|
||||
__riscv_vsseg4e8_v_u8m2x4(dst + i * cn, seg, vl);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
int k = 0;
|
||||
for (; k <= cn - 4; k += 4)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
const uchar *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
vl = __riscv_vsetvl_e8m2(len - i);
|
||||
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
|
||||
__riscv_vle8_v_u8m2(src0 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src1 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src2 + i, vl),
|
||||
__riscv_vle8_v_u8m2(src3 + i, vl)
|
||||
);
|
||||
__riscv_vssseg4e8_v_u8m2x4(dst + k + i * cn, cn, seg, vl);
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
for (; k < cn; ++k)
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
|
||||
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
|
||||
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
|
||||
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn, sizeof(uchar)*cn, a, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 1, sizeof(uchar)*cn, b, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 2, sizeof(uchar)*cn, c, vl);
|
||||
__riscv_vsse8_v_u8m1(dst + k+i*cn + 3, sizeof(uchar)*cn, d, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
const uchar* srcK = src[k];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
vl = __riscv_vsetvl_e8m2(len - i);
|
||||
vuint8m2_t seg = __riscv_vle8_v_u8m2(srcK + i, vl);
|
||||
__riscv_vsse8_v_u8m2(dst + k + i * cn, cn, seg, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
#if defined __GNUC__
|
||||
__attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
|
||||
int k = cn % 4 ? cn % 4 : 4;
|
||||
int i = 0;
|
||||
int vl = __riscv_vsetvlmax_e16m1();
|
||||
if( k == 1 )
|
||||
int vl = 0;
|
||||
if (cn == 1)
|
||||
{
|
||||
const ushort* src0 = src[0];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
vl = __riscv_vsetvl_e16m8(len - i);
|
||||
__riscv_vse16_v_u16m8(dst + i, __riscv_vle16_v_u16m8(src0 + i, vl), vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++)
|
||||
dst[i*cn] = src0[i];
|
||||
}
|
||||
else if( k == 2 )
|
||||
else if (cn == 2)
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
vl = __riscv_vsetvl_e16m4(len - i);
|
||||
vuint16m4x2_t seg = __riscv_vcreate_v_u16m4x2(
|
||||
__riscv_vle16_v_u16m4(src0 + i, vl),
|
||||
__riscv_vle16_v_u16m4(src1 + i, vl)
|
||||
);
|
||||
__riscv_vsseg2e16_v_u16m4x2(dst + i * cn, seg, vl);
|
||||
}
|
||||
}
|
||||
else if( k == 3 )
|
||||
else if (cn == 3)
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
vl = __riscv_vsetvl_e16m2(len - i);
|
||||
vuint16m2x3_t seg = __riscv_vcreate_v_u16m2x3(
|
||||
__riscv_vle16_v_u16m2(src0 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src1 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src2 + i, vl)
|
||||
);
|
||||
__riscv_vsseg3e16_v_u16m2x3(dst + i * cn, seg, vl);
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
}
|
||||
else if (cn == 4)
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
vl = __riscv_vsetvl_e16m2(len - i);
|
||||
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
|
||||
__riscv_vle16_v_u16m2(src0 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src1 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src2 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src3 + i, vl)
|
||||
);
|
||||
__riscv_vsseg4e16_v_u16m2x4(dst + i * cn, seg, vl);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
||||
for( ; i <= len - vl; i += vl)
|
||||
int k = 0;
|
||||
for (; k <= cn - 4; k += 4)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
const ushort *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
vl = __riscv_vsetvl_e16m2(len - i);
|
||||
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
|
||||
__riscv_vle16_v_u16m2(src0 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src1 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src2 + i, vl),
|
||||
__riscv_vle16_v_u16m2(src3 + i, vl)
|
||||
);
|
||||
__riscv_vssseg4e16_v_u16m2x4(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
for (; k < cn; ++k)
|
||||
{
|
||||
dst[i*cn] = src0[i];
|
||||
dst[i*cn+1] = src1[i];
|
||||
dst[i*cn+2] = src2[i];
|
||||
dst[i*cn+3] = src3[i];
|
||||
}
|
||||
}
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for( ; k < cn; k += 4 )
|
||||
{
|
||||
const uint16_t *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
||||
i = 0;
|
||||
for( ; i <= len - vl; i += vl)
|
||||
{
|
||||
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
|
||||
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
|
||||
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
|
||||
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn, sizeof(ushort)*cn, a, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 1, sizeof(ushort)*cn, b, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 2, sizeof(ushort)*cn, c, vl);
|
||||
__riscv_vsse16_v_u16m1(dst + k+i*cn + 3, sizeof(ushort)*cn, d, vl);
|
||||
}
|
||||
for( ; i < len; i++ )
|
||||
{
|
||||
dst[k+i*cn] = src0[i];
|
||||
dst[k+i*cn+1] = src1[i];
|
||||
dst[k+i*cn+2] = src2[i];
|
||||
dst[k+i*cn+3] = src3[i];
|
||||
const ushort* srcK = src[k];
|
||||
for (int i = 0; i < len; i += vl)
|
||||
{
|
||||
vl = __riscv_vsetvl_e16m2(len - i);
|
||||
vuint16m2_t seg = __riscv_vle16_v_u16m2(srcK + i, vl);
|
||||
__riscv_vsse16_v_u16m2(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
|
||||
}
|
||||
}
|
||||
}
|
||||
return CV_HAL_ERROR_OK;
|
||||
|
Loading…
Reference in New Issue
Block a user