Merge pull request #27229 from fengyuentau:4x/hal_rvv/transpose

HAL: implemented cv_hal_transpose in hal_rvv #27229

Checklists:

- [x] transpose2d_8u
- [x] transpose2d_16u
- [ ] ~transpose2d_8uC3~
- [x] transpose2d_32s
- [ ] ~transpose2d_16uC3~
- [x] transpose2d_32sC2
- [ ] ~transpose_32sC3~
- [ ] ~transpose_32sC4~
- [ ] ~transpose_32sC6~
- [ ] ~transpose_32sC8~
- [ ] ~inplace transpose~


### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Yuantao Feng 2025-04-22 16:03:26 +08:00 committed by GitHub
parent cd5a636459
commit 325e59bd4c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 234 additions and 0 deletions

View File

@ -49,6 +49,7 @@
#include "hal_rvv_1p0/div.hpp" // core
#include "hal_rvv_1p0/dotprod.hpp" // core
#include "hal_rvv_1p0/compare.hpp" // core
#include "hal_rvv_1p0/transpose.hpp" // core
#include "hal_rvv_1p0/moments.hpp" // imgproc
#include "hal_rvv_1p0/filter.hpp" // imgproc

View File

@ -0,0 +1,220 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED
#define OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace transpose {
#if defined (__clang__) && __clang_major__ < 18
#define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x4(v, 0, v0); \
v = __riscv_vset(v, 1, v1); \
v = __riscv_vset(v, 2, v2); \
v = __riscv_vset(v, 3, v3);
#define OPENCV_HAL_IMPL_RVV_VCREATE_x8(suffix, width, v0, v1, v2, v3, v4, v5, v6, v7) \
__riscv_vset_v_##suffix##m##width##_##suffix##m##width##x8(v, 0, v0); \
v = __riscv_vset(v, 1, v1); \
v = __riscv_vset(v, 2, v2); \
v = __riscv_vset(v, 3, v3); \
v = __riscv_vset(v, 4, v4); \
v = __riscv_vset(v, 5, v5); \
v = __riscv_vset(v, 6, v6); \
v = __riscv_vset(v, 7, v7);
#define __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(u8, 1, v0, v1, v2, v3, v4, v5, v6, v7)
#define __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(u16, 1, v0, v1, v2, v3, v4, v5, v6, v7)
#define __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(i32, 1, v0, v1, v2, v3)
#define __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(i64, 1, v0, v1, v2, v3, v4, v5, v6, v7)
#endif
static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_8u_8xVl = [](const uchar *src, size_t src_step, uchar *dst, size_t dst_step, const int vl) {
auto v0 = __riscv_vle8_v_u8m1(src, vl);
auto v1 = __riscv_vle8_v_u8m1(src + src_step, vl);
auto v2 = __riscv_vle8_v_u8m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle8_v_u8m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle8_v_u8m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle8_v_u8m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle8_v_u8m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle8_v_u8m1(src + 7 * src_step, vl);
vuint8m1x8_t v = __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e8(dst, dst_step, v, vl);
};
int h = 0, w = 0;
for (; h <= src_height - 8; h += 8) {
const uchar *src = src_data + h * src_step;
uchar *dst = dst_data + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e8m1(src_width - w);
transpose_8u_8xVl(src + w, src_step, dst + w * dst_step, dst_step, vl);
}
}
for (; h < src_height; h++) {
const uchar *src = src_data + h * src_step;
uchar *dst = dst_data + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e8m8(src_width - w);
auto v = __riscv_vle8_v_u8m8(src + w, vl);
__riscv_vsse8(dst + w * dst_step, dst_step, v, vl);
}
}
}
static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_16u_8xVl = [](const ushort *src, size_t src_step, ushort *dst, size_t dst_step, const int vl) {
auto v0 = __riscv_vle16_v_u16m1(src, vl);
auto v1 = __riscv_vle16_v_u16m1(src + src_step, vl);
auto v2 = __riscv_vle16_v_u16m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle16_v_u16m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle16_v_u16m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle16_v_u16m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle16_v_u16m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle16_v_u16m1(src + 7 * src_step, vl);
vuint16m1x8_t v = __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e16(dst, dst_step, v, vl);
};
size_t src_step_base = src_step / sizeof(ushort);
size_t dst_step_base = dst_step / sizeof(ushort);
int h = 0, w = 0;
for (; h <= src_height - 8; h += 8) {
const ushort *src = (const ushort*)(src_data) + h * src_step_base;
ushort *dst = (ushort*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e16m1(src_width - w);
transpose_16u_8xVl(src + w, src_step_base, dst + w * dst_step_base, dst_step, vl);
}
}
for (; h < src_height; h++) {
const ushort *src = (const ushort*)(src_data) + h * src_step_base;
ushort *dst = (ushort*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e16m8(src_width - w);
auto v = __riscv_vle16_v_u16m8(src + w, vl);
__riscv_vsse16(dst + w * dst_step_base, dst_step, v, vl);
}
}
}
static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_32s_4xVl = [](const int *src, size_t src_step, int *dst, size_t dst_step, const int vl) {
auto v0 = __riscv_vle32_v_i32m1(src, vl);
auto v1 = __riscv_vle32_v_i32m1(src + src_step, vl);
auto v2 = __riscv_vle32_v_i32m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle32_v_i32m1(src + 3 * src_step, vl);
vint32m1x4_t v = __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3);
__riscv_vssseg4e32(dst, dst_step, v, vl);
};
size_t src_step_base = src_step / sizeof(int);
size_t dst_step_base = dst_step / sizeof(int);
int h = 0, w = 0;
for (; h <= src_height - 4; h += 4) {
const int *src = (const int*)(src_data) + h * src_step_base;
int *dst = (int*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e32m1(src_width - w);
transpose_32s_4xVl(src + w, src_step_base, dst + w * dst_step_base, dst_step, vl);
}
}
for (; h < src_height; h++) {
const int *src = (const int*)(src_data) + h * src_step_base;
int *dst = (int*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e32m8(src_width - w);
auto v = __riscv_vle32_v_i32m8(src + w, vl);
__riscv_vsse32(dst + w * dst_step_base, dst_step, v, vl);
}
}
}
static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_64s_8xVl = [](const int64_t *src, size_t src_step, int64_t *dst, size_t dst_step, const int vl) {
auto v0 = __riscv_vle64_v_i64m1(src, vl);
auto v1 = __riscv_vle64_v_i64m1(src + src_step, vl);
auto v2 = __riscv_vle64_v_i64m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle64_v_i64m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle64_v_i64m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle64_v_i64m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle64_v_i64m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle64_v_i64m1(src + 7 * src_step, vl);
vint64m1x8_t v = __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e64(dst, dst_step, v, vl);
};
size_t src_step_base = src_step / sizeof(int64_t);
size_t dst_step_base = dst_step / sizeof(int64_t);
int h = 0, w = 0;
for (; h <= src_height - 8; h += 8) {
const int64_t *src = (const int64_t*)(src_data) + h * src_step_base;
int64_t *dst = (int64_t*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e64m1(src_width - w);
transpose_64s_8xVl(src + w, src_step_base, dst + w * dst_step_base, dst_step, vl);
}
}
for (; h < src_height; h++) {
const int64_t *src = (const int64_t*)(src_data) + h * src_step_base;
int64_t *dst = (int64_t*)(dst_data) + h;
int vl;
for (w = 0; w < src_width; w += vl) {
vl = __riscv_vsetvl_e64m8(src_width - w);
auto v = __riscv_vle64_v_i64m8(src + w, vl);
__riscv_vsse64(dst + w * dst_step_base, dst_step, v, vl);
}
}
}
#undef cv_hal_transpose2d
#define cv_hal_transpose2d cv::cv_hal_rvv::transpose::transpose2d
using Transpose2dFunc = void (*)(const uchar*, size_t, uchar*, size_t, int, int);
inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
int src_width, int src_height, int element_size) {
if (src_data == dst_data) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
static Transpose2dFunc tab[] = {
0, transpose2d_8u, transpose2d_16u, 0,
transpose2d_32s, 0, 0, 0,
transpose2d_32sC2, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0
};
Transpose2dFunc func = tab[element_size];
if (!func) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
func(src_data, src_step, dst_data, dst_step, src_width, src_height);
return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::transpose
#endif // OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED

View File

@ -422,6 +422,19 @@ PERF_TEST_P_(BinaryOpTest, reciprocal)
SANITY_CHECK_NOTHING();
}
PERF_TEST_P_(BinaryOpTest, transpose2d)
{
Size sz = get<0>(GetParam());
int type = get<1>(GetParam());
Size tsz = Size(sz.height, sz.width);
cv::Mat a(sz, type), b(tsz, type);;
declare.in(a, WARMUP_RNG).out(b);
TEST_CYCLE() cv::transpose(a, b);
SANITY_CHECK_NOTHING();
}
PERF_TEST_P_(BinaryOpTest, transposeND)
{