mirror of
https://github.com/opencv/opencv.git
synced 2025-08-06 06:26:29 +08:00

Fix RISC-V HAL solve/SVD and BGRtoLab #27046

Closes #27044. Also suppressed some warnings in other HALs.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
360 lines
12 KiB
C++
360 lines
12 KiB
C++
// This file is part of OpenCV project.
|
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
|
|
|
|
#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
|
#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
|
|
|
|
#include <riscv_vector.h>
|
|
|
|
namespace cv { namespace cv_hal_rvv {
|
|
|
|
// Point the HAL merge hooks at the RVV implementations defined in this header.
// Each hook is #undef'ed first so that any definition already in effect is
// replaced instead of clashing with the new one.

// 8-bit elements
#undef cv_hal_merge8u
#define cv_hal_merge8u cv::cv_hal_rvv::merge8u

// 16-bit elements
#undef cv_hal_merge16u
#define cv_hal_merge16u cv::cv_hal_rvv::merge16u

// 32-bit elements
#undef cv_hal_merge32s
#define cv_hal_merge32s cv::cv_hal_rvv::merge32s

// 64-bit elements
#undef cv_hal_merge64s
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
|
|
|
|
#if defined __clang__ && __clang_major__ < 18
// Compatibility layer for clang older than 18: the __riscv_vcreate_v_* names
// used by the merge kernels are emulated with chained __riscv_vset_v_* calls
// (presumably because these compiler versions lack vcreate - confirm).
//
// IMPORTANT: the emulation only works in the exact form
//     <tuple type> seg = __riscv_vcreate_v_...(a, b, ...);
// because the expansion refers to a variable literally named `seg`: the first
// vset call becomes the initializer of `seg`, and every following statement
// re-assigns `seg`. Each expansion also supplies its own trailing ';'.

// Builds one `__riscv_vset_v_<sfx>m<lmul>_<sfx>m<lmul>x<nf>(seg, idx, val)` call.
#define OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(sfx, lmul, nf, idx, val) \
    __riscv_vset_v_##sfx##m##lmul##_##sfx##m##lmul##x##nf(seg, idx, val)

// Two-field tuple.
#define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \
    OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 2, 0, v0); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 2, 1, v1);

// Three-field tuple.
#define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \
    OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 3, 0, v0); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 3, 1, v1); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 3, 2, v2);

// Four-field tuple.
#define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \
    OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 4, 0, v0); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 4, 1, v1); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 4, 2, v2); \
    seg = OPENCV_HAL_IMPL_RVV_MERGE_VSET_SEG(suffix, width, 4, 3, v3);

// 8-bit tuples used by merge8u.
#define __riscv_vcreate_v_u8m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u8, 4, v0, v1)
#define __riscv_vcreate_v_u8m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u8, 2, v0, v1, v2)
#define __riscv_vcreate_v_u8m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u8, 2, v0, v1, v2, v3)

// 16-bit tuples used by merge16u.
#define __riscv_vcreate_v_u16m4x2(v0, v1) OPENCV_HAL_IMPL_RVV_VCREATE_x2(u16, 4, v0, v1)
#define __riscv_vcreate_v_u16m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u16, 2, v0, v1, v2)
#define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3)
#endif // clang < 18
|
|
|
|
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
|
|
int vl = 0;
|
|
if (cn == 1)
|
|
{
|
|
const uchar* src0 = src[0];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m8(len - i);
|
|
__riscv_vse8_v_u8m8(dst + i, __riscv_vle8_v_u8m8(src0 + i, vl), vl);
|
|
}
|
|
}
|
|
else if (cn == 2)
|
|
{
|
|
const uchar *src0 = src[0], *src1 = src[1];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m4(len - i);
|
|
vuint8m4x2_t seg = __riscv_vcreate_v_u8m4x2(
|
|
__riscv_vle8_v_u8m4(src0 + i, vl),
|
|
__riscv_vle8_v_u8m4(src1 + i, vl)
|
|
);
|
|
__riscv_vsseg2e8_v_u8m4x2(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else if (cn == 3)
|
|
{
|
|
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m2(len - i);
|
|
vuint8m2x3_t seg = __riscv_vcreate_v_u8m2x3(
|
|
__riscv_vle8_v_u8m2(src0 + i, vl),
|
|
__riscv_vle8_v_u8m2(src1 + i, vl),
|
|
__riscv_vle8_v_u8m2(src2 + i, vl)
|
|
);
|
|
__riscv_vsseg3e8_v_u8m2x3(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else if (cn == 4)
|
|
{
|
|
const uchar *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m2(len - i);
|
|
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
|
|
__riscv_vle8_v_u8m2(src0 + i, vl),
|
|
__riscv_vle8_v_u8m2(src1 + i, vl),
|
|
__riscv_vle8_v_u8m2(src2 + i, vl),
|
|
__riscv_vle8_v_u8m2(src3 + i, vl)
|
|
);
|
|
__riscv_vsseg4e8_v_u8m2x4(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int k = 0;
|
|
for (; k <= cn - 4; k += 4)
|
|
{
|
|
const uchar *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m2(len - i);
|
|
vuint8m2x4_t seg = __riscv_vcreate_v_u8m2x4(
|
|
__riscv_vle8_v_u8m2(src0 + i, vl),
|
|
__riscv_vle8_v_u8m2(src1 + i, vl),
|
|
__riscv_vle8_v_u8m2(src2 + i, vl),
|
|
__riscv_vle8_v_u8m2(src3 + i, vl)
|
|
);
|
|
__riscv_vssseg4e8_v_u8m2x4(dst + k + i * cn, cn, seg, vl);
|
|
}
|
|
}
|
|
for (; k < cn; ++k)
|
|
{
|
|
const uchar* srcK = src[k];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e8m2(len - i);
|
|
vuint8m2_t seg = __riscv_vle8_v_u8m2(srcK + i, vl);
|
|
__riscv_vsse8_v_u8m2(dst + k + i * cn, cn, seg, vl);
|
|
}
|
|
}
|
|
}
|
|
return CV_HAL_ERROR_OK;
|
|
}
|
|
|
|
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
|
|
int vl = 0;
|
|
if (cn == 1)
|
|
{
|
|
const ushort* src0 = src[0];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m8(len - i);
|
|
__riscv_vse16_v_u16m8(dst + i, __riscv_vle16_v_u16m8(src0 + i, vl), vl);
|
|
}
|
|
}
|
|
else if (cn == 2)
|
|
{
|
|
const ushort *src0 = src[0], *src1 = src[1];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m4(len - i);
|
|
vuint16m4x2_t seg = __riscv_vcreate_v_u16m4x2(
|
|
__riscv_vle16_v_u16m4(src0 + i, vl),
|
|
__riscv_vle16_v_u16m4(src1 + i, vl)
|
|
);
|
|
__riscv_vsseg2e16_v_u16m4x2(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else if (cn == 3)
|
|
{
|
|
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m2(len - i);
|
|
vuint16m2x3_t seg = __riscv_vcreate_v_u16m2x3(
|
|
__riscv_vle16_v_u16m2(src0 + i, vl),
|
|
__riscv_vle16_v_u16m2(src1 + i, vl),
|
|
__riscv_vle16_v_u16m2(src2 + i, vl)
|
|
);
|
|
__riscv_vsseg3e16_v_u16m2x3(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else if (cn == 4)
|
|
{
|
|
const ushort *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m2(len - i);
|
|
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
|
|
__riscv_vle16_v_u16m2(src0 + i, vl),
|
|
__riscv_vle16_v_u16m2(src1 + i, vl),
|
|
__riscv_vle16_v_u16m2(src2 + i, vl),
|
|
__riscv_vle16_v_u16m2(src3 + i, vl)
|
|
);
|
|
__riscv_vsseg4e16_v_u16m2x4(dst + i * cn, seg, vl);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int k = 0;
|
|
for (; k <= cn - 4; k += 4)
|
|
{
|
|
const ushort *src0 = src[k], *src1 = src[k + 1], *src2 = src[k + 2], *src3 = src[k + 3];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m2(len - i);
|
|
vuint16m2x4_t seg = __riscv_vcreate_v_u16m2x4(
|
|
__riscv_vle16_v_u16m2(src0 + i, vl),
|
|
__riscv_vle16_v_u16m2(src1 + i, vl),
|
|
__riscv_vle16_v_u16m2(src2 + i, vl),
|
|
__riscv_vle16_v_u16m2(src3 + i, vl)
|
|
);
|
|
__riscv_vssseg4e16_v_u16m2x4(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
|
|
}
|
|
}
|
|
for (; k < cn; ++k)
|
|
{
|
|
const ushort* srcK = src[k];
|
|
for (int i = 0; i < len; i += vl)
|
|
{
|
|
vl = __riscv_vsetvl_e16m2(len - i);
|
|
vuint16m2_t seg = __riscv_vle16_v_u16m2(srcK + i, vl);
|
|
__riscv_vsse16_v_u16m2(dst + k + i * cn, cn * sizeof(ushort), seg, vl);
|
|
}
|
|
}
|
|
}
|
|
return CV_HAL_ERROR_OK;
|
|
}
|
|
|
|
#if defined __GNUC__ && !defined(__clang__)
|
|
__attribute__((optimize("no-tree-vectorize")))
|
|
#endif
|
|
inline int merge32s(const int** src, int* dst, int len, int cn ) {
|
|
int k = cn % 4 ? cn % 4 : 4;
|
|
int i, j;
|
|
if( k == 1 )
|
|
{
|
|
const int* src0 = src[0];
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( i = j = 0; i < len; i++, j += cn )
|
|
dst[j] = src0[i];
|
|
}
|
|
else if( k == 2 )
|
|
{
|
|
const int *src0 = src[0], *src1 = src[1];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i];
|
|
dst[j+1] = src1[i];
|
|
}
|
|
}
|
|
else if( k == 3 )
|
|
{
|
|
const int *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i];
|
|
dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
const int *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i]; dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
|
}
|
|
}
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; k < cn; k += 4 )
|
|
{
|
|
const int *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
|
for( i = 0, j = k; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i]; dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
|
}
|
|
}
|
|
return CV_HAL_ERROR_OK;
|
|
}
|
|
|
|
#if defined __GNUC__ && !defined(__clang__)
|
|
__attribute__((optimize("no-tree-vectorize")))
|
|
#endif
|
|
inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
|
|
int k = cn % 4 ? cn % 4 : 4;
|
|
int i, j;
|
|
if( k == 1 )
|
|
{
|
|
const int64* src0 = src[0];
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( i = j = 0; i < len; i++, j += cn )
|
|
dst[j] = src0[i];
|
|
}
|
|
else if( k == 2 )
|
|
{
|
|
const int64 *src0 = src[0], *src1 = src[1];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i];
|
|
dst[j+1] = src1[i];
|
|
}
|
|
}
|
|
else if( k == 3 )
|
|
{
|
|
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i];
|
|
dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
const int64 *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
|
|
i = j = 0;
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i]; dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
|
}
|
|
}
|
|
#if defined(__clang__)
|
|
#pragma clang loop vectorize(disable)
|
|
#endif
|
|
for( ; k < cn; k += 4 )
|
|
{
|
|
const int64 *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
|
|
for( i = 0, j = k; i < len; i++, j += cn )
|
|
{
|
|
dst[j] = src0[i]; dst[j+1] = src1[i];
|
|
dst[j+2] = src2[i]; dst[j+3] = src3[i];
|
|
}
|
|
}
|
|
return CV_HAL_ERROR_OK;
|
|
}
|
|
|
|
}}
|
|
|
|
#endif
|