Merge pull request #26923 from dkurt:merge_rvv_opt

Further optimization of cv::merge RVV HAL for 8U and 16S #26923

### Pull Request Readiness Checklist


* Banana Pi BF3 (SpacemiT K1) RISC-V
* Compiler: Syntacore Clang 18.1.4 (build 2024.12)

```
Geometric mean (ms)

                     Name of Test                       baseline   pr       pr
                                                         merge              vs    
                                                                         baseline
                                                                          merge
                                                                        (x-factor)
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 2)      0.013   0.003     3.76   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 3)      0.020   0.006     3.46   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 4)      0.026   0.010     2.61   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 5)      0.043   0.028     1.56   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 6)      0.054   0.035     1.53   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 7)      0.065   0.050     1.30   
merge::Size_SrcDepth_DstChannels::(127x61, 8UC1, 8)      0.070   0.036     1.95   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 2)     0.015   0.008     1.82   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 3)     0.022   0.015     1.48   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 4)     0.029   0.018     1.63   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 5)     0.067   0.044     1.54   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 6)     0.088   0.056     1.58   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 7)     0.104   0.076     1.38   
merge::Size_SrcDepth_DstChannels::(127x61, 16SC1, 8)     0.116   0.065     1.79   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 2)     0.421   0.176     2.39   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 3)     0.792   0.284     2.79   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 4)     1.090   0.370     2.95   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 5)     1.835   1.399     1.31   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 6)     2.389   1.776     1.35   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 7)     3.000   2.471     1.21   
merge::Size_SrcDepth_DstChannels::(640x480, 8UC1, 8)     3.178   2.104     1.51   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 2)    0.490   0.377     1.30   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 3)    1.348   0.602     2.24   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 4)    1.827   0.813     2.25   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 5)    3.283   2.692     1.22   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 6)    4.922   3.334     1.48   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 7)    5.725   4.399     1.30   
merge::Size_SrcDepth_DstChannels::(640x480, 16SC1, 8)    6.278   4.748     1.32   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 2)    1.267   0.603     2.10   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 3)    2.394   0.934     2.56   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 4)    3.236   1.434     2.26   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 5)    5.398   4.345     1.24   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 6)    7.127   5.459     1.31   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 7)    8.590   7.298     1.18   
merge::Size_SrcDepth_DstChannels::(1280x720, 8UC1, 8)    9.360   6.152     1.52   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 2)   1.482   1.242     1.19   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 3)   4.008   1.817     2.21   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 4)   6.079   2.468     2.46   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 5)   11.300  8.644     1.31   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 6)   15.125  12.126    1.25   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 7)   17.555  14.804    1.19   
merge::Size_SrcDepth_DstChannels::(1280x720, 16SC1, 8)   18.890  14.163    1.33   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 2)   2.910   1.326     2.19   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 3)   5.351   1.997     2.68   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 4)   7.290   2.629     2.77   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 5)   12.426  9.611     1.29   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 6)   16.453  12.162    1.35   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 7)   19.420  16.190    1.20   
merge::Size_SrcDepth_DstChannels::(1920x1080, 8UC1, 8)   20.588  13.699    1.50   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 2)  3.400   2.640     1.29   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 3)  8.986   3.952     2.27   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 4)  11.972  5.273     2.27   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 5)  20.544  17.996    1.14   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 6)  28.677  22.086    1.30   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 7)  32.958  27.713    1.19   
merge::Size_SrcDepth_DstChannels::(1920x1080, 16SC1, 8)  36.499  27.439    1.33
```

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Dmitry Kurtaev 2025-02-20 17:28:28 +03:00 committed by GitHub
parent 58b14294b5
commit 7a2b048c92
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -17,236 +17,195 @@ namespace cv { namespace cv_hal_rvv {
#undef cv_hal_merge64s
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
#if defined __clang__ && __clang_major__ < 18
#define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x2(seg, 0, v0); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x2(seg, 1, v1);
#define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 0, v0); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 1, v1); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x3(seg, 2, v2);
#define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 0, v0); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 1, v1); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 2, v2); \
seg = __riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(seg, 3, v3);
#define __riscv_vcreate_v_u8m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u8, 4, v0, v1)
#define __riscv_vcreate_v_u8m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u8, 2, v0, v1, v2)
#define __riscv_vcreate_v_u8m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u8, 2, v0, v1, v2, v3)
#define __riscv_vcreate_v_u16m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u16, 4, v0, v1)
#define __riscv_vcreate_v_u16m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u16, 2, v0, v1, v2)
#define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3)
#endif // clang < 18
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i = 0;
int vl = __riscv_vsetvlmax_e8m1();
if( k == 1 )
int vl = 0;
if (cn == 1)
{
const uchar* src0 = src[0];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
vl = __riscv_vsetvl_e8m8(len - i);
__riscv_vse8_v_u8m8(dst + i, __riscv_vle8_v_u8m8(src0 + i, vl), vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++)
dst[i*cn] = src0[i];
}
else if( k == 2 )
else if (cn == 2)
{
const uchar *src0 = src[0], *src1 = src[1];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
vl = __riscv_vsetvl_e8m4(len - i);
vuint8m4x2_t seg = __riscv_vcreate_v_u8m4x2(
__riscv_vle8_v_u8m4(src0 + i, vl),
__riscv_vle8_v_u8m4(src1 + i, vl)
);
__riscv_vsseg2e8_v_u8m4x2(dst + i * cn, seg, vl);
}
}
else if( k == 3 )
else if (cn == 3)
{
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
vl = __riscv_vsetvl_e8m2(len - i);
vuint8m2x3_t seg = __riscv_vcreate_v_u8m2x3(
__riscv_vle8_v_u8m2(src0 + i, vl),
__riscv_vle8_v_u8m2(src1 + i, vl),
__riscv_vle8_v_u8m2(src2 + i, vl)
);
__riscv_vsseg3e8_v_u8m2x3(dst + i * cn, seg, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
}
else if (cn == 4)
{
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for (int i = 0; i < len; i += vl)
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
vl = __riscv_vsetvl_e8m2(len - i);
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
__riscv_vle8_v_u8m2(src0 + i, vl),
__riscv_vle8_v_u8m2(src1 + i, vl),
__riscv_vle8_v_u8m2(src2 + i, vl),
__riscv_vle8_v_u8m2(src3 + i, vl)
);
__riscv_vsseg4e8_v_u8m2x4(dst + i * cn, seg, vl);
}
}
else
{
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for( ; i <= len - vl; i += vl)
int k = 0;
for (; k <= cn - 4; k += 4)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
__riscv_vsse8_v_u8m1(dst + i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 2, sizeof(uchar)*cn, c, vl);
__riscv_vsse8_v_u8m1(dst + i*cn + 3, sizeof(uchar)*cn, d, vl);
const uchar *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
for (int i = 0; i < len; i += vl)
{
vl = __riscv_vsetvl_e8m2(len - i);
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
__riscv_vle8_v_u8m2(src0 + i, vl),
__riscv_vle8_v_u8m2(src1 + i, vl),
__riscv_vle8_v_u8m2(src2 + i, vl),
__riscv_vle8_v_u8m2(src3 + i, vl)
);
__riscv_vssseg4e8_v_u8m2x4(dst + k + i * cn, cn, seg, vl);
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
for (; k < cn; ++k)
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
dst[i*cn+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const uchar *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
i = 0;
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle8_v_u8m1(src0 + i, vl);
auto b = __riscv_vle8_v_u8m1(src1 + i, vl);
auto c = __riscv_vle8_v_u8m1(src2 + i, vl);
auto d = __riscv_vle8_v_u8m1(src3 + i, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn, sizeof(uchar)*cn, a, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 1, sizeof(uchar)*cn, b, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 2, sizeof(uchar)*cn, c, vl);
__riscv_vsse8_v_u8m1(dst + k+i*cn + 3, sizeof(uchar)*cn, d, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[k+i*cn] = src0[i];
dst[k+i*cn+1] = src1[i];
dst[k+i*cn+2] = src2[i];
dst[k+i*cn+3] = src3[i];
const uchar* srcK = src[k];
for (int i = 0; i < len; i += vl)
{
vl = __riscv_vsetvl_e8m2(len - i);
vuint8m2_t seg = __riscv_vle8_v_u8m2(srcK + i, vl);
__riscv_vsse8_v_u8m2(dst + k + i * cn, cn, seg, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
#if defined __GNUC__
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i = 0;
int vl = __riscv_vsetvlmax_e16m1();
if( k == 1 )
int vl = 0;
if (cn == 1)
{
const ushort* src0 = src[0];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
vl = __riscv_vsetvl_e16m8(len - i);
__riscv_vse16_v_u16m8(dst + i, __riscv_vle16_v_u16m8(src0 + i, vl), vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++)
dst[i*cn] = src0[i];
}
else if( k == 2 )
else if (cn == 2)
{
const ushort *src0 = src[0], *src1 = src[1];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
vl = __riscv_vsetvl_e16m4(len - i);
vuint16m4x2_t seg = __riscv_vcreate_v_u16m4x2(
__riscv_vle16_v_u16m4(src0 + i, vl),
__riscv_vle16_v_u16m4(src1 + i, vl)
);
__riscv_vsseg2e16_v_u16m4x2(dst + i * cn, seg, vl);
}
}
else if( k == 3 )
else if (cn == 3)
{
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
for( ; i <= len - vl; i += vl)
for (int i = 0; i < len; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
vl = __riscv_vsetvl_e16m2(len - i);
vuint16m2x3_t seg = __riscv_vcreate_v_u16m2x3(
__riscv_vle16_v_u16m2(src0 + i, vl),
__riscv_vle16_v_u16m2(src1 + i, vl),
__riscv_vle16_v_u16m2(src2 + i, vl)
);
__riscv_vsseg3e16_v_u16m2x3(dst + i * cn, seg, vl);
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
}
else if (cn == 4)
{
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for (int i = 0; i < len; i += vl)
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
vl = __riscv_vsetvl_e16m2(len - i);
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
__riscv_vle16_v_u16m2(src0 + i, vl),
__riscv_vle16_v_u16m2(src1 + i, vl),
__riscv_vle16_v_u16m2(src2 + i, vl),
__riscv_vle16_v_u16m2(src3 + i, vl)
);
__riscv_vsseg4e16_v_u16m2x4(dst + i * cn, seg, vl);
}
}
else
{
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
for( ; i <= len - vl; i += vl)
int k = 0;
for (; k <= cn - 4; k += 4)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
__riscv_vsse16_v_u16m1(dst + i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 2, sizeof(ushort)*cn, c, vl);
__riscv_vsse16_v_u16m1(dst + i*cn + 3, sizeof(ushort)*cn, d, vl);
const ushort *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
for (int i = 0; i < len; i += vl)
{
vl = __riscv_vsetvl_e16m2(len - i);
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
__riscv_vle16_v_u16m2(src0 + i, vl),
__riscv_vle16_v_u16m2(src1 + i, vl),
__riscv_vle16_v_u16m2(src2 + i, vl),
__riscv_vle16_v_u16m2(src3 + i, vl)
);
__riscv_vssseg4e16_v_u16m2x4(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; i < len; i++ )
for (; k < cn; ++k)
{
dst[i*cn] = src0[i];
dst[i*cn+1] = src1[i];
dst[i*cn+2] = src2[i];
dst[i*cn+3] = src3[i];
}
}
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
for( ; k < cn; k += 4 )
{
const uint16_t *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
i = 0;
for( ; i <= len - vl; i += vl)
{
auto a = __riscv_vle16_v_u16m1(src0 + i, vl);
auto b = __riscv_vle16_v_u16m1(src1 + i, vl);
auto c = __riscv_vle16_v_u16m1(src2 + i, vl);
auto d = __riscv_vle16_v_u16m1(src3 + i, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn, sizeof(ushort)*cn, a, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 1, sizeof(ushort)*cn, b, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 2, sizeof(ushort)*cn, c, vl);
__riscv_vsse16_v_u16m1(dst + k+i*cn + 3, sizeof(ushort)*cn, d, vl);
}
for( ; i < len; i++ )
{
dst[k+i*cn] = src0[i];
dst[k+i*cn+1] = src1[i];
dst[k+i*cn+2] = src2[i];
dst[k+i*cn+3] = src3[i];
const ushort* srcK = src[k];
for (int i = 0; i < len; i += vl)
{
vl = __riscv_vsetvl_e16m2(len - i);
vuint16m2_t seg = __riscv_vle16_v_u16m2(srcK + i, vl);
__riscv_vsse16_v_u16m2(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
}
}
}
return CV_HAL_ERROR_OK;