Merge pull request #26943 from GenshinImpactStarts:flip_hal_rvv

Impl RISC-V HAL for cv::flip | Add perf test for flip #26943 

Implement through the existing `cv_hal_flip` interfaces.

Add perf test for `cv::flip`.

The reason why select these args for testing:
- **size**: copied from perf_lut
- **type**:
    - U8C1: basic situation
    - U8C3: unaligned element size
    - U8C4: large element size

Tested on
- MUSE-PI (vlen=256)
- Compiler: gcc 14.2 (riscv-collab/riscv-gnu-toolchain Nightly: December 16, 2024)

```sh
$ opencv_test_core --gtest_filter="Core_Flip/ElemWiseTest.*"
$ opencv_perf_core --gtest_filter="Size_MatType_FlipCode*" --perf_min_samples=300 --perf_force_samples=300
```

```
Geometric mean (ms)

                     Name of Test                       scalar   ui    rvv       ui        rvv    
                                                                                 vs         vs    
                                                                               scalar     scalar  
                                                                             (x-factor) (x-factor)
flip::Size_MatType_FlipCode::(320x240, 8UC1, FLIP_X)    0.026  0.033  0.031     0.81       0.84   
flip::Size_MatType_FlipCode::(320x240, 8UC1, FLIP_XY)   0.206  0.212  0.091     0.97       2.26   
flip::Size_MatType_FlipCode::(320x240, 8UC1, FLIP_Y)    0.185  0.189  0.082     0.98       2.25   
flip::Size_MatType_FlipCode::(320x240, 8UC3, FLIP_X)    0.070  0.084  0.084     0.83       0.83   
flip::Size_MatType_FlipCode::(320x240, 8UC3, FLIP_XY)   0.616  0.612  0.235     1.01       2.62   
flip::Size_MatType_FlipCode::(320x240, 8UC3, FLIP_Y)    0.587  0.603  0.204     0.97       2.88   
flip::Size_MatType_FlipCode::(320x240, 8UC4, FLIP_X)    0.263  0.110  0.109     2.40       2.41   
flip::Size_MatType_FlipCode::(320x240, 8UC4, FLIP_XY)   0.930  0.831  0.316     1.12       2.95   
flip::Size_MatType_FlipCode::(320x240, 8UC4, FLIP_Y)    1.175  1.129  0.313     1.04       3.75   
flip::Size_MatType_FlipCode::(640x480, 8UC1, FLIP_X)    0.303  0.118  0.111     2.57       2.73   
flip::Size_MatType_FlipCode::(640x480, 8UC1, FLIP_XY)   0.949  0.836  0.405     1.14       2.34   
flip::Size_MatType_FlipCode::(640x480, 8UC1, FLIP_Y)    0.784  0.783  0.409     1.00       1.92   
flip::Size_MatType_FlipCode::(640x480, 8UC3, FLIP_X)    1.084  0.360  0.355     3.01       3.06   
flip::Size_MatType_FlipCode::(640x480, 8UC3, FLIP_XY)   3.768  3.348  1.364     1.13       2.76   
flip::Size_MatType_FlipCode::(640x480, 8UC3, FLIP_Y)    4.361  4.473  1.296     0.97       3.37   
flip::Size_MatType_FlipCode::(640x480, 8UC4, FLIP_X)    1.252  0.469  0.451     2.67       2.78   
flip::Size_MatType_FlipCode::(640x480, 8UC4, FLIP_XY)   5.732  5.220  1.303     1.10       4.40   
flip::Size_MatType_FlipCode::(640x480, 8UC4, FLIP_Y)    5.041  5.105  1.203     0.99       4.19   
flip::Size_MatType_FlipCode::(1920x1080, 8UC1, FLIP_X)  2.382  0.903  0.903     2.64       2.64   
flip::Size_MatType_FlipCode::(1920x1080, 8UC1, FLIP_XY) 8.606  7.508  2.581     1.15       3.33   
flip::Size_MatType_FlipCode::(1920x1080, 8UC1, FLIP_Y)  8.421  8.535  2.219     0.99       3.80   
flip::Size_MatType_FlipCode::(1920x1080, 8UC3, FLIP_X)  6.312  2.416  2.429     2.61       2.60   
flip::Size_MatType_FlipCode::(1920x1080, 8UC3, FLIP_XY) 29.174 26.055 12.761    1.12       2.29   
flip::Size_MatType_FlipCode::(1920x1080, 8UC3, FLIP_Y)  25.373 25.500 13.382    1.00       1.90   
flip::Size_MatType_FlipCode::(1920x1080, 8UC4, FLIP_X)  7.620  3.204  3.115     2.38       2.45   
flip::Size_MatType_FlipCode::(1920x1080, 8UC4, FLIP_XY) 32.876 29.310 12.976    1.12       2.53   
flip::Size_MatType_FlipCode::(1920x1080, 8UC4, FLIP_Y)  28.831 29.094 14.919    0.99       1.93   
```

The optimization for vlen <= 256 and > 256 are different, but I have no real hardware with vlen > 256. So accuracy tests for that like 512 and 1024 are conducted on QEMU built from the `riscv-collab/riscv-gnu-toolchain`.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
GenshinImpactStarts 2025-02-24 13:56:23 +08:00 committed by GitHub
parent b5f5540e8a
commit 6a6a5a765d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 294 additions and 0 deletions

View File

@ -28,6 +28,7 @@
#include "hal_rvv_1p0/minmax.hpp" // core #include "hal_rvv_1p0/minmax.hpp" // core
#include "hal_rvv_1p0/atan.hpp" // core #include "hal_rvv_1p0/atan.hpp" // core
#include "hal_rvv_1p0/split.hpp" // core #include "hal_rvv_1p0/split.hpp" // core
#include "hal_rvv_1p0/flip.hpp" // core
#endif #endif
#endif #endif

248
3rdparty/hal_rvv/hal_rvv_1p0/flip.hpp vendored Normal file
View File

@ -0,0 +1,248 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#pragma once
#include <riscv_vector.h>
#include <opencv2/core/base.hpp>
#include <opencv2/core/utility.hpp>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_flip
#define cv_hal_flip cv::cv_hal_rvv::flip
struct FlipVlen256
{
using TableElemType = uchar;
using TableType = vuint8m8_t;
static inline size_t setvlmax()
{
return __riscv_vsetvlmax_e8m8();
}
static inline TableType vid(size_t vl)
{
return __riscv_vid_v_u8m8(vl);
}
static inline TableType loadTable(const TableElemType* ptr, size_t vl)
{
return __riscv_vle8_v_u8m8(ptr, vl);
}
static inline void gather(const uchar* src, TableType tab, uchar* dst, size_t vl)
{
auto v = __riscv_vle8_v_u8m8(src, vl);
__riscv_vse8(dst, __riscv_vrgather(v, tab, vl), vl);
}
};
struct FlipVlen512
{
using TableElemType = uint16_t;
using TableType = vuint16m8_t;
static inline size_t setvlmax()
{
return __riscv_vsetvlmax_e8m4();
}
static inline TableType vid(size_t vl)
{
return __riscv_vid_v_u16m8(vl);
}
static inline TableType loadTable(const TableElemType* ptr, size_t vl)
{
return __riscv_vle16_v_u16m8(ptr, vl);
}
static inline void gather(const uchar* src, TableType tab, uchar* dst, size_t vl)
{
auto v = __riscv_vle8_v_u8m4(src, vl);
__riscv_vse8(dst, __riscv_vrgatherei16(v, tab, vl), vl);
}
};
template <typename T>
inline void flipFillBuffer(cv::AutoBuffer<T>& _buf, size_t len, int esz)
{
T* buf = _buf.data();
for (int i = (int)len - esz; i >= 0; i -= esz, buf += esz)
for (int j = 0; j < esz; j++)
buf[j] = (T)(i + j);
}
inline void flipX(int esz,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step)
{
size_t w = (size_t)src_width * esz;
auto src0 = src_data, src1 = src_data + src_step * (src_height - 1);
auto dst0 = dst_data, dst1 = dst_data + dst_step * (src_height - 1);
size_t vl;
for (src_height -= 2; src_height >= 0;
src_height -= 2, src0 += src_step, dst0 += dst_step, src1 -= src_step, dst1 -= dst_step)
{
for (size_t i = 0; i < w; i += vl)
{
vl = __riscv_vsetvl_e8m8(w - i);
__riscv_vse8(dst1 + i, __riscv_vle8_v_u8m8(src0 + i, vl), vl);
__riscv_vse8(dst0 + i, __riscv_vle8_v_u8m8(src1 + i, vl), vl);
}
}
if (src_height == -1)
{
for (size_t i = 0; i < w; i += (int)vl)
{
vl = __riscv_vsetvl_e8m8(w - i);
__riscv_vse8(dst0 + i, __riscv_vle8_v_u8m8(src1 + i, vl), vl);
}
}
}
template <typename FlipVlen>
inline void flipY(int esz,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step)
{
size_t w = (size_t)src_width * esz;
size_t vl = std::min(FlipVlen::setvlmax() / esz * esz, w);
typename FlipVlen::TableType tab_v;
if (esz == 1)
tab_v = __riscv_vrsub(FlipVlen::vid(vl), vl - 1, vl);
else
{
cv::AutoBuffer<typename FlipVlen::TableElemType> buf(vl);
flipFillBuffer(buf, vl, esz);
tab_v = FlipVlen::loadTable(buf.data(), vl);
}
if (vl == w)
for (; src_height; src_height--, src_data += src_step, dst_data += dst_step)
FlipVlen::gather(src_data, tab_v, dst_data, vl);
else
for (; src_height; src_height--, src_data += src_step, dst_data += dst_step)
{
auto src0 = src_data, src1 = src_data + w - vl;
auto dst0 = dst_data, dst1 = dst_data + w - vl;
for (; src0 < src1 + vl; src0 += vl, src1 -= vl, dst0 += vl, dst1 -= vl)
{
FlipVlen::gather(src0, tab_v, dst1, vl);
FlipVlen::gather(src1, tab_v, dst0, vl);
}
}
}
template <typename FlipVlen>
inline void flipXY(int esz,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step)
{
size_t w = (size_t)src_width * esz;
size_t vl = std::min(FlipVlen::setvlmax() / esz * esz, w);
typename FlipVlen::TableType tab_v;
if (esz == 1)
tab_v = __riscv_vrsub(FlipVlen::vid(vl), vl - 1, vl);
else
{
cv::AutoBuffer<typename FlipVlen::TableElemType> buf(vl);
flipFillBuffer(buf, vl, esz);
tab_v = FlipVlen::loadTable(buf.data(), vl);
}
auto src0 = src_data, src1 = src_data + src_step * (src_height - 1);
auto dst0 = dst_data, dst1 = dst_data + dst_step * (src_height - 1);
if (vl == w)
{
for (src_height -= 2; src_height >= 0;
src_height -= 2,
src0 += src_step,
dst0 += dst_step,
src1 -= src_step,
dst1 -= dst_step)
{
FlipVlen::gather(src0, tab_v, dst1, vl);
FlipVlen::gather(src1, tab_v, dst0, vl);
}
if (src_height == -1)
{
FlipVlen::gather(src1, tab_v, dst0, vl);
}
}
else
{
for (src_height -= 2; src_height >= 0;
src_height -= 2,
src0 += src_step,
dst0 += dst_step,
src1 -= src_step,
dst1 -= dst_step)
{
for (size_t i = 0; 2 * i < w; i += vl)
{
FlipVlen::gather(src0 + i, tab_v, dst1 + w - i - vl, vl);
FlipVlen::gather(src0 + w - i - vl, tab_v, dst1 + i, vl);
FlipVlen::gather(src1 + i, tab_v, dst0 + w - i - vl, vl);
FlipVlen::gather(src1 + w - i - vl, tab_v, dst0 + i, vl);
}
}
if (src_height == -1)
{
for (size_t i = 0; 2 * i < w; i += vl)
{
FlipVlen::gather(src1 + i, tab_v, dst0 + w - i - vl, vl);
FlipVlen::gather(src1 + w - i - vl, tab_v, dst0 + i, vl);
}
}
}
}
inline int flip(int src_type,
const uchar* src_data,
size_t src_step,
int src_width,
int src_height,
uchar* dst_data,
size_t dst_step,
int flip_mode)
{
if (src_width < 0 || src_height < 0 || src_data == dst_data)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
int esz = CV_ELEM_SIZE(src_type);
if (flip_mode == 0)
{
flipX(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
}
else if (flip_mode > 0)
{
if (__riscv_vlenb() * 8 <= 256)
flipY<FlipVlen256>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
else
flipY<FlipVlen512>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
}
else
{
if (__riscv_vlenb() * 8 <= 256)
flipXY<FlipVlen256>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
else
flipXY<FlipVlen512>(esz, src_data, src_step, src_width, src_height, dst_data, dst_step);
}
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv

View File

@ -0,0 +1,45 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
namespace opencv_test { namespace {
using namespace perf;
enum
{
FLIP_XY = 0,
FLIP_X = 1,
FLIP_Y = 2,
};
#define FLIP_SIZES szQVGA, szVGA, sz1080p
#define FLIP_TYPES CV_8UC1, CV_8UC3, CV_8UC4
#define FLIP_CODES FLIP_X, FLIP_Y, FLIP_XY
CV_FLAGS(FlipCode, FLIP_X, FLIP_Y, FLIP_XY);
typedef tuple<Size, MatType, FlipCode> Size_MatType_FlipCode_t;
typedef perf::TestBaseWithParam<Size_MatType_FlipCode_t> Size_MatType_FlipCode;
PERF_TEST_P(Size_MatType_FlipCode,
flip,
testing::Combine(testing::Values(FLIP_SIZES),
testing::Values(FLIP_TYPES),
testing::Values(FLIP_CODES)))
{
Size sz = get<0>(GetParam());
int matType = get<1>(GetParam());
int flipCode = get<2>(GetParam()) - 1;
Mat src(sz, matType);
Mat dst(sz, matType);
declare.in(src, WARMUP_RNG).out(dst);
TEST_CYCLE() cv::flip(src, dst, flipCode);
SANITY_CHECK_NOTHING();
}
}} // namespace opencv_test