Merge pull request #25883 from hanliutong:rvv-intrin-upgrade

Upgrade RISC-V Vector intrinsic and cleanup the obsolete RVV backend. #25883

This patch upgrade RISC-V Vector intrinsic from `v0.10` to `v0.12`/`v1.0`:
- Update cmake check and options;
- Upgrade RVV implement for Universal Intrinsic;
- Upgrade RVV optimized DNN kernel.
- Cleanup the obsolete RVV backend (`intrin_rvv.hpp`) and compatable header file.

With this patch, RVV backend require Clang 17+ or GCC 14+ (which means `__riscv_v_intrinsic >= 12000`, see https://godbolt.org/z/es7ncETE3)

This patch is test with Clang 17.0.6 (require extra `-DWITH_PNG=OFF` due to ICE), Clang 18.1.8 and GCC 14.1.0 on QEMU and k230 (with `--gtest_filter="*hal_*"`).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
HAN Liutong 2024-07-19 16:41:42 +08:00 committed by GitHub
parent 4dd54bbec9
commit b5ea32158a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 780 additions and 29521 deletions

View File

@ -390,14 +390,9 @@ elseif(PPC64LE)
set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(RISCV)
option(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON)
ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV")
ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gcv")
if(RISCV_RVV_SCALABLE)
set(CPU_RVV_FLAGS_ON "${CPU_RVV_FLAGS_ON} -DCV_RVV_SCALABLE")
endif()
ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*")
set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}")

View File

@ -1,17 +1,16 @@
#include <stdio.h>
#if defined(__riscv)
# include <riscv_vector.h>
# define CV_RVV 1
#if !defined(__riscv) || !defined(__riscv_v)
#error "RISC-V or vector extension(RVV) is not supported by the compiler"
#endif
#if defined CV_RVV
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
#define vreinterpret_v_u64m1_u8m1 __riscv_vreinterpret_v_u64m1_u8m1
#define vle64_v_u64m1 __riscv_vle64_v_u64m1
#define vle32_v_f32m1 __riscv_vle32_v_f32m1
#define vfmv_f_s_f32m1_f32 __riscv_vfmv_f_s_f32m1_f32
#if !defined(__THEAD_VERSION__) && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic < 12000
#error "Wrong intrinsics version, v0.12 or higher is required for gcc or clang"
#endif
#include <riscv_vector.h>
#ifdef __THEAD_VERSION__
int test()
{
const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
@ -21,7 +20,14 @@ int test()
return (int)vfmv_f_s_f32m1_f32(val);
}
#else
#error "RISC-V vector extension(RVV) is not supported"
int test()
{
const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
uint64_t ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
vuint8m1_t a = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(ptr, 2));
vfloat32m1_t val = __riscv_vle32_v_f32m1((const float*)(src), 4);
return (int)__riscv_vfmv_f_s_f32m1_f32(val);
}
#endif
int main()

View File

@ -146,9 +146,23 @@
# define CV_NEON 1
#endif
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
# include<riscv_vector.h>
# define CV_RVV071 1
/* RVV-related macro states with different compiler
// +--------------------+----------+----------+
// | Macro | Upstream | XuanTie |
// +--------------------+----------+----------+
// | CV_CPU_COMPILE_RVV | defined | defined |
// | CV_RVV | 1 | 0 |
// | CV_RVV071 | 0 | 1 |
// | CV_TRY_RVV | 1 | 1 |
// +--------------------+----------+----------+
*/
#ifdef CV_CPU_COMPILE_RVV
# ifdef __riscv_vector_071
# define CV_RVV071 1
# else
# define CV_RVV 1
# endif
#include <riscv_vector.h>
#endif
#ifdef CV_CPU_COMPILE_VSX
@ -183,11 +197,6 @@
# include <wasm_simd128.h>
#endif
#if defined CV_CPU_COMPILE_RVV
# define CV_RVV 1
# include <riscv_vector.h>
#endif
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX

View File

@ -236,11 +236,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#include "opencv2/core/hal/intrin_wasm.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#if defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#else
#include "opencv2/core/hal/intrin_rvv.hpp"
#endif
#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)

File diff suppressed because it is too large Load Diff

View File

@ -1,768 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copied from
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
#ifndef __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
#define __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
// The maximum number of parameters is 20, this is held by segment load
// instructions with a NFIELD (NF) of 8. 20 is contributed by 8 vector register
// pointers passed, 1 vector mask register, 8 passthrough register for
// undisturbed policy, and 3 for address base, byte index, vl.
#define _GET_OVERRIDE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,\
_14, _15, _16, _17, _18, _19, _20, NAME, ...) NAME
#if __has_include ("riscv_vector.h")
#include <riscv_vector.h>
#endif
#ifndef __RISCV_VECTOR_H
#include_next <riscv_vector.h>
#endif
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
#define vmerge(mask, op1, op2, vl) __riscv_vmerge((op1), (op2), (mask), (vl))
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
#define vfmerge(mask, op1, op2, vl) __riscv_vfmerge((op1), (op2), (mask), (vl))
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
// masked functions
#define vcompress(mask, dest, src, vl) __riscv_vcompress_tu((dest), (src), (mask), (vl))
// Reinterpret between different type under the same SEW/LMUL
// Reinterpret between different SEW under the same LMUL
#define vse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse16, __riscv_vse16, 2, 1)(__VA_ARGS__)
#define vse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse32, __riscv_vse32, 2, 1)(__VA_ARGS__)
#define vse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse64, __riscv_vse64, 2, 1)(__VA_ARGS__)
#define vse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse8, __riscv_vse8, 2, 1)(__VA_ARGS__)
#define vsse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse16, __riscv_vsse16, 3, 2, 1)(__VA_ARGS__)
#define vsse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse32, __riscv_vsse32, 3, 2, 1)(__VA_ARGS__)
#define vsse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse64, __riscv_vsse64, 3, 2, 1)(__VA_ARGS__)
#define vsse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse8, __riscv_vsse8, 3, 2, 1)(__VA_ARGS__)
#define vloxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei8_tumu, 4, __riscv_vloxei8, 2, 1)(__VA_ARGS__)
#define vloxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei16_tumu, 4, __riscv_vloxei16, 2, 1)(__VA_ARGS__)
#define vloxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei32_tumu, 4, __riscv_vloxei32, 2, 1)(__VA_ARGS__)
#define vloxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei64_tumu, 4, __riscv_vloxei64, 2, 1)(__VA_ARGS__)
#define vluxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei8_tumu, 4, __riscv_vluxei8, 2, 1)(__VA_ARGS__)
#define vluxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei16_tumu, 4, __riscv_vluxei16, 2, 1)(__VA_ARGS__)
#define vluxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei32_tumu, 4, __riscv_vluxei32, 2, 1)(__VA_ARGS__)
#define vluxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei64_tumu, 4, __riscv_vluxei64, 2, 1)(__VA_ARGS__)
#define vsoxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei8, __riscv_vsoxei8, 3, 2, 1)(__VA_ARGS__)
#define vsoxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei16, __riscv_vsoxei16, 3, 2, 1)(__VA_ARGS__)
#define vsoxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei32, __riscv_vsoxei32, 3, 2, 1)(__VA_ARGS__)
#define vsoxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei64, __riscv_vsoxei64, 3, 2, 1)(__VA_ARGS__)
#define vsuxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei8, __riscv_vsuxei8, 3, 2, 1)(__VA_ARGS__)
#define vsuxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei16, __riscv_vsuxei16, 3, 2, 1)(__VA_ARGS__)
#define vsuxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei32, __riscv_vsuxei32, 3, 2, 1)(__VA_ARGS__)
#define vsuxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei64, __riscv_vsuxei64, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e16, __riscv_vsseg2e16, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e16, __riscv_vsseg3e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e16, __riscv_vsseg4e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e16, __riscv_vsseg5e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e16, __riscv_vsseg6e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e16, __riscv_vsseg7e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e16, __riscv_vsseg8e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e32, __riscv_vsseg2e32, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e32, __riscv_vsseg3e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e32, __riscv_vsseg4e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e32, __riscv_vsseg5e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e32, __riscv_vsseg6e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e32, __riscv_vsseg7e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e32, __riscv_vsseg8e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e64, __riscv_vsseg2e64, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e64, __riscv_vsseg3e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e64, __riscv_vsseg4e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e64, __riscv_vsseg5e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e64, __riscv_vsseg6e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e64, __riscv_vsseg7e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e64, __riscv_vsseg8e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e8, __riscv_vsseg2e8, 3, 2, 1)(__VA_ARGS__)
#define vsseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e8, __riscv_vsseg3e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e8, __riscv_vsseg4e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e8, __riscv_vsseg5e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e8, __riscv_vsseg6e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e8, __riscv_vsseg7e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e8, __riscv_vsseg8e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e16, __riscv_vssseg2e16, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e16, __riscv_vssseg3e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e16, __riscv_vssseg4e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e16, __riscv_vssseg5e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e16, __riscv_vssseg6e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e16, __riscv_vssseg7e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e16, __riscv_vssseg8e16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e32, __riscv_vssseg2e32, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e32, __riscv_vssseg3e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e32, __riscv_vssseg4e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e32, __riscv_vssseg5e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e32, __riscv_vssseg6e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e32, __riscv_vssseg7e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e32, __riscv_vssseg8e32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e64, __riscv_vssseg2e64, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e64, __riscv_vssseg3e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e64, __riscv_vssseg4e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e64, __riscv_vssseg5e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e64, __riscv_vssseg6e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e64, __riscv_vssseg7e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e64, __riscv_vssseg8e64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e8, __riscv_vssseg2e8, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e8, __riscv_vssseg3e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e8, __riscv_vssseg4e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e8, __riscv_vssseg5e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e8, __riscv_vssseg6e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e8, __riscv_vssseg7e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vssseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e8, __riscv_vssseg8e8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei8_tumu, 7, 6, __riscv_vloxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei8_tumu, 9, 8, 7, __riscv_vloxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei16_tumu, 7, 6, __riscv_vloxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei16_tumu, 9, 8, 7, __riscv_vloxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei32_tumu, 7, 6, __riscv_vloxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei32_tumu, 9, 8, 7, __riscv_vloxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei64_tumu, 7, 6, __riscv_vloxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei64_tumu, 9, 8, 7, __riscv_vloxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vloxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei8_tumu, 7, 6, __riscv_vluxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei8_tumu, 9, 8, 7, __riscv_vluxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei16_tumu, 7, 6, __riscv_vluxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei16_tumu, 9, 8, 7, __riscv_vluxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei32_tumu, 7, 6, __riscv_vluxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei32_tumu, 9, 8, 7, __riscv_vluxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei64_tumu, 7, 6, __riscv_vluxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei64_tumu, 9, 8, 7, __riscv_vluxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vluxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei8, __riscv_vsoxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei8, __riscv_vsoxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei8, __riscv_vsoxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei8, __riscv_vsoxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei8, __riscv_vsoxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei8, __riscv_vsoxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei8, __riscv_vsoxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei16, __riscv_vsoxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei16, __riscv_vsoxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei16, __riscv_vsoxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei16, __riscv_vsoxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei16, __riscv_vsoxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei16, __riscv_vsoxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei16, __riscv_vsoxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei32, __riscv_vsoxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei32, __riscv_vsoxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei32, __riscv_vsoxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei32, __riscv_vsoxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei32, __riscv_vsoxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei32, __riscv_vsoxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei32, __riscv_vsoxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei64, __riscv_vsoxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei64, __riscv_vsoxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei64, __riscv_vsoxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei64, __riscv_vsoxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei64, __riscv_vsoxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei64, __riscv_vsoxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsoxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei64, __riscv_vsoxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei8, __riscv_vsuxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei8, __riscv_vsuxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei8, __riscv_vsuxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei8, __riscv_vsuxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei8, __riscv_vsuxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei8, __riscv_vsuxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei8, __riscv_vsuxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei16, __riscv_vsuxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei16, __riscv_vsuxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei16, __riscv_vsuxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei16, __riscv_vsuxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei16, __riscv_vsuxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei16, __riscv_vsuxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei16, __riscv_vsuxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei32, __riscv_vsuxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei32, __riscv_vsuxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei32, __riscv_vsuxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei32, __riscv_vsuxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei32, __riscv_vsuxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei32, __riscv_vsuxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei32, __riscv_vsuxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei64, __riscv_vsuxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei64, __riscv_vsuxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei64, __riscv_vsuxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei64, __riscv_vsuxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei64, __riscv_vsuxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei64, __riscv_vsuxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vsuxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei64, __riscv_vsuxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
#define vadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vadd_tumu, 4, __riscv_vadd, 2, 1)(__VA_ARGS__)
#define vsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsub_tumu, 4, __riscv_vsub, 2, 1)(__VA_ARGS__)
#define vrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrsub_tumu, 4, __riscv_vrsub, 2, 1)(__VA_ARGS__)
#define vneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vneg_tumu, 3, __riscv_vneg, 1)(__VA_ARGS__)
#define vwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vv_tumu, 4, __riscv_vwadd_vv, 2, 1)(__VA_ARGS__)
#define vwadd_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vx_tumu, 4, __riscv_vwadd_vx, 2, 1)(__VA_ARGS__)
#define vwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wv_tumu, 4, __riscv_vwadd_wv, 2, 1)(__VA_ARGS__)
#define vwadd_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wx_tumu, 4, __riscv_vwadd_wx, 2, 1)(__VA_ARGS__)
#define vwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vv_tumu, 4, __riscv_vwsub_vv, 2, 1)(__VA_ARGS__)
#define vwsub_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vx_tumu, 4, __riscv_vwsub_vx, 2, 1)(__VA_ARGS__)
#define vwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wv_tumu, 4, __riscv_vwsub_wv, 2, 1)(__VA_ARGS__)
#define vwsub_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wx_tumu, 4, __riscv_vwsub_wx, 2, 1)(__VA_ARGS__)
#define vwaddu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vv_tumu, 4, __riscv_vwaddu_vv, 2, 1)(__VA_ARGS__)
#define vwaddu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vx_tumu, 4, __riscv_vwaddu_vx, 2, 1)(__VA_ARGS__)
#define vwaddu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wv_tumu, 4, __riscv_vwaddu_wv, 2, 1)(__VA_ARGS__)
#define vwaddu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wx_tumu, 4, __riscv_vwaddu_wx, 2, 1)(__VA_ARGS__)
#define vwsubu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vv_tumu, 4, __riscv_vwsubu_vv, 2, 1)(__VA_ARGS__)
#define vwsubu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vx_tumu, 4, __riscv_vwsubu_vx, 2, 1)(__VA_ARGS__)
#define vwsubu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wv_tumu, 4, __riscv_vwsubu_wv, 2, 1)(__VA_ARGS__)
#define vwsubu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wx_tumu, 4, __riscv_vwsubu_wx, 2, 1)(__VA_ARGS__)
#define vsext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf2_tumu, 3, __riscv_vsext_vf2, 1)(__VA_ARGS__)
#define vsext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf4_tumu, 3, __riscv_vsext_vf4, 1)(__VA_ARGS__)
#define vsext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf8_tumu, 3, __riscv_vsext_vf8, 1)(__VA_ARGS__)
#define vzext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf2_tumu, 3, __riscv_vzext_vf2, 1)(__VA_ARGS__)
#define vzext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf4_tumu, 3, __riscv_vzext_vf4, 1)(__VA_ARGS__)
#define vzext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf8_tumu, 3, __riscv_vzext_vf8, 1)(__VA_ARGS__)
#define vadc(...) __riscv_vadc(__VA_ARGS__)
#define vsbc(...) __riscv_vsbc(__VA_ARGS__)
#define vmadc(...) __riscv_vmadc(__VA_ARGS__)
#define vmsbc(...) __riscv_vmsbc(__VA_ARGS__)
#define vand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vand_tumu, 4, __riscv_vand, 2, 1)(__VA_ARGS__)
#define vor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vor_tumu, 4, __riscv_vor, 2, 1)(__VA_ARGS__)
#define vxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vxor_tumu, 4, __riscv_vxor, 2, 1)(__VA_ARGS__)
#define vnot(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vnot_tumu, 3, __riscv_vnot, 1)(__VA_ARGS__)
#define vsll(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsll_tumu, 4, __riscv_vsll, 2, 1)(__VA_ARGS__)
#define vsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsra_tumu, 4, __riscv_vsra, 2, 1)(__VA_ARGS__)
#define vsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsrl_tumu, 4, __riscv_vsrl, 2, 1)(__VA_ARGS__)
#define vnsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsra_tumu, 4, __riscv_vnsra, 2, 1)(__VA_ARGS__)
#define vnsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsrl_tumu, 4, __riscv_vnsrl, 2, 1)(__VA_ARGS__)
#define vmseq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmseq_mu, 4, __riscv_vmseq, 2, 1)(__VA_ARGS__)
#define vmsne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsne_mu, 4, __riscv_vmsne, 2, 1)(__VA_ARGS__)
#define vmslt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmslt_mu, 4, __riscv_vmslt, 2, 1)(__VA_ARGS__)
#define vmsle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsle_mu, 4, __riscv_vmsle, 2, 1)(__VA_ARGS__)
#define vmsgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgt_mu, 4, __riscv_vmsgt, 2, 1)(__VA_ARGS__)
#define vmsge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsge_mu, 4, __riscv_vmsge, 2, 1)(__VA_ARGS__)
#define vmsltu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsltu_mu, 4, __riscv_vmsltu, 2, 1)(__VA_ARGS__)
#define vmsleu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsleu_mu, 4, __riscv_vmsleu, 2, 1)(__VA_ARGS__)
#define vmsgtu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgtu_mu, 4, __riscv_vmsgtu, 2, 1)(__VA_ARGS__)
#define vmsgeu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgeu_mu, 4, __riscv_vmsgeu, 2, 1)(__VA_ARGS__)
#define vmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmin_tumu, 4, __riscv_vmin, 2, 1)(__VA_ARGS__)
#define vmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmax_tumu, 4, __riscv_vmax, 2, 1)(__VA_ARGS__)
#define vminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vminu_tumu, 4, __riscv_vminu, 2, 1)(__VA_ARGS__)
#define vmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmaxu_tumu, 4, __riscv_vmaxu, 2, 1)(__VA_ARGS__)
#define vmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmul_tumu, 4, __riscv_vmul, 2, 1)(__VA_ARGS__)
#define vmulh(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulh_tumu, 4, __riscv_vmulh, 2, 1)(__VA_ARGS__)
#define vmulhsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhsu_tumu, 4, __riscv_vmulhsu, 2, 1)(__VA_ARGS__)
#define vmulhu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhu_tumu, 4, __riscv_vmulhu, 2, 1)(__VA_ARGS__)
#define vdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdiv_tumu, 4, __riscv_vdiv, 2, 1)(__VA_ARGS__)
#define vrem(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrem_tumu, 4, __riscv_vrem, 2, 1)(__VA_ARGS__)
#define vdivu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdivu_tumu, 4, __riscv_vdivu, 2, 1)(__VA_ARGS__)
#define vremu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vremu_tumu, 4, __riscv_vremu, 2, 1)(__VA_ARGS__)
#define vwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmul_tumu, 4, __riscv_vwmul, 2, 1)(__VA_ARGS__)
#define vwmulsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulsu_tumu, 4, __riscv_vwmulsu, 2, 1)(__VA_ARGS__)
#define vwmulu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulu_tumu, 4, __riscv_vwmulu, 2, 1)(__VA_ARGS__)
#define vmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmacc_tumu, __riscv_vmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsac_tumu, __riscv_vnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmadd_tumu, __riscv_vmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsub_tumu, __riscv_vnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmacc_tumu, __riscv_vwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccsu_tumu, __riscv_vwmaccsu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccus(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccus_tumu, __riscv_vwmaccus_tu, 3, 2, 1)(__VA_ARGS__)
#define vwmaccu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccu_tumu, __riscv_vwmaccu_tu, 3, 2, 1)(__VA_ARGS__)
#define vmv_v(...) __riscv_vmv_v(__VA_ARGS__)
#define vsadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsadd_tumu, 4, __riscv_vsadd, 2, 1)(__VA_ARGS__)
#define vssub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssub_tumu, 4, __riscv_vssub, 2, 1)(__VA_ARGS__)
#define vsaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsaddu_tumu, 4, __riscv_vsaddu, 2, 1)(__VA_ARGS__)
#define vssubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssubu_tumu, 4, __riscv_vssubu, 2, 1)(__VA_ARGS__)
#define vaadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaadd_tumu, 4, __riscv_vaadd, 2, 1)(__VA_ARGS__)
#define vasub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasub_tumu, 4, __riscv_vasub, 2, 1)(__VA_ARGS__)
#define vaaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaaddu_tumu, 4, __riscv_vaaddu, 2, 1)(__VA_ARGS__)
#define vasubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasubu_tumu, 4, __riscv_vasubu, 2, 1)(__VA_ARGS__)
#define vsmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsmul_mu, 4, __riscv_vsmul, 2, 1)(__VA_ARGS__)
#define vssra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssra_tumu, 4, __riscv_vssra, 2, 1)(__VA_ARGS__)
#define vssrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssrl_tumu, 4, __riscv_vssrl, 2, 1)(__VA_ARGS__)
#define vnclip(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclip_tumu, 4, __riscv_vnclip, 2, 1)(__VA_ARGS__)
#define vnclipu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclipu_tumu, 4, __riscv_vnclipu, 2, 1)(__VA_ARGS__)
#define vfadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfadd_tumu, 4, __riscv_vfadd, 2, 1)(__VA_ARGS__)
#define vfsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsub_tumu, 4, __riscv_vfsub, 2, 1)(__VA_ARGS__)
#define vfrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrsub_tumu, 4, __riscv_vfrsub, 2, 1)(__VA_ARGS__)
#define vfneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfneg_tumu, 3, __riscv_vfneg, 1)(__VA_ARGS__)
#define vfwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vv_tumu, 4, __riscv_vfwadd_vv, 2, 1)(__VA_ARGS__)
#define vfwadd_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vf_tumu, 4, __riscv_vfwadd_vf, 2, 1)(__VA_ARGS__)
#define vfwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wv_tumu, 4, __riscv_vfwadd_wv, 2, 1)(__VA_ARGS__)
#define vfwadd_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wf_tumu, 4, __riscv_vfwadd_wf, 2, 1)(__VA_ARGS__)
#define vfwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vv_tumu, 4, __riscv_vfwsub_vv, 2, 1)(__VA_ARGS__)
#define vfwsub_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vf_tumu, 4, __riscv_vfwsub_vf, 2, 1)(__VA_ARGS__)
#define vfwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wv_tumu, 4, __riscv_vfwsub_wv, 2, 1)(__VA_ARGS__)
#define vfwsub_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wf_tumu, 4, __riscv_vfwsub_wf, 2, 1)(__VA_ARGS__)
#define vfmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmul_tumu, 4, __riscv_vfmul, 2, 1)(__VA_ARGS__)
#define vfdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfdiv_tumu, 4, __riscv_vfdiv, 2, 1)(__VA_ARGS__)
#define vfrdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrdiv_tumu, 4, __riscv_vfrdiv, 2, 1)(__VA_ARGS__)
#define vfwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmul_tumu, 4, __riscv_vfwmul, 2, 1)(__VA_ARGS__)
#define vfmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmacc_tumu, __riscv_vfmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmacc_tumu, __riscv_vfnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsac_tumu, __riscv_vfmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsac_tumu, __riscv_vfnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmadd_tumu, __riscv_vfmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmadd_tumu, __riscv_vfnmadd_tu, 3, 2, 1)(__VA_ARGS__)
#define vfmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsub_tumu, __riscv_vfmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsub_tumu, __riscv_vfnmsub_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmacc_tumu, __riscv_vfwmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmacc_tumu, __riscv_vfwnmacc_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmsac_tumu, __riscv_vfwmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmsac_tumu, __riscv_vfwnmsac_tu, 3, 2, 1)(__VA_ARGS__)
#define vfsqrt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfsqrt_tumu, 3, __riscv_vfsqrt, 1)(__VA_ARGS__)
#define vfrsqrt7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrsqrt7_tumu, 3, __riscv_vfrsqrt7, 1)(__VA_ARGS__)
#define vfrec7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrec7_tumu, 3, __riscv_vfrec7, 1)(__VA_ARGS__)
#define vfmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmin_tumu, 4, __riscv_vfmin, 2, 1)(__VA_ARGS__)
#define vfmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmax_tumu, 4, __riscv_vfmax, 2, 1)(__VA_ARGS__)
#define vfsgnj(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnj_tumu, 4, __riscv_vfsgnj, 2, 1)(__VA_ARGS__)
#define vfsgnjn(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjn_tumu, 4, __riscv_vfsgnjn, 2, 1)(__VA_ARGS__)
#define vfsgnjx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjx_tumu, 4, __riscv_vfsgnjx, 2, 1)(__VA_ARGS__)
#define vfabs(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfabs_tumu, 3, __riscv_vfabs, 1)(__VA_ARGS__)
#define vmfeq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfeq_mu, 4, __riscv_vmfeq, 2, 1)(__VA_ARGS__)
#define vmfne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfne_mu, 4, __riscv_vmfne, 2, 1)(__VA_ARGS__)
#define vmflt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmflt_mu, 4, __riscv_vmflt, 2, 1)(__VA_ARGS__)
#define vmfle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfle_mu, 4, __riscv_vmfle, 2, 1)(__VA_ARGS__)
#define vmfgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfgt_mu, 4, __riscv_vmfgt, 2, 1)(__VA_ARGS__)
#define vmfge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfge_mu, 4, __riscv_vmfge, 2, 1)(__VA_ARGS__)
#define vfclass(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfclass_tumu, 3, __riscv_vfclass, 1)(__VA_ARGS__)
#define vfcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_x_tumu, 3, __riscv_vfcvt_x, 1)(__VA_ARGS__)
#define vfcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_x_tumu, 3, __riscv_vfcvt_rtz_x, 1)(__VA_ARGS__)
#define vfcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_xu_tumu, 3, __riscv_vfcvt_xu, 1)(__VA_ARGS__)
#define vfcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_xu_tumu, 3, __riscv_vfcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_f_tumu, 3, __riscv_vfcvt_f, 1)(__VA_ARGS__)
#define vwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvt_x_tumu, 3, __riscv_vwcvt_x, 1)(__VA_ARGS__)
#define vwcvtu_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvtu_x_tumu, 3, __riscv_vwcvtu_x, 1)(__VA_ARGS__)
#define vfwcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_f_tumu, 3, __riscv_vfwcvt_f, 1)(__VA_ARGS__)
#define vfwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_x_tumu, 3, __riscv_vfwcvt_x, 1)(__VA_ARGS__)
#define vfwcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_x_tumu, 3, __riscv_vfwcvt_rtz_x, 1)(__VA_ARGS__)
#define vfwcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_xu_tumu, 3, __riscv_vfwcvt_xu, 1)(__VA_ARGS__)
#define vfwcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_xu_tumu, 3, __riscv_vfwcvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_x_tumu, 3, __riscv_vfncvt_x, 1)(__VA_ARGS__)
#define vfncvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_x_tumu, 3, __riscv_vfncvt_rtz_x, 1)(__VA_ARGS__)
#define vncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vncvt_x_tumu, 3, __riscv_vncvt_x, 1)(__VA_ARGS__)
#define vfncvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_xu_tumu, 3, __riscv_vfncvt_xu, 1)(__VA_ARGS__)
#define vfncvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_xu_tumu, 3, __riscv_vfncvt_rtz_xu, 1)(__VA_ARGS__)
#define vfncvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_f_tumu, 3, __riscv_vfncvt_f, 1)(__VA_ARGS__)
#define vfncvt_rod_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rod_f_tumu, 3, __riscv_vfncvt_rod_f, 1)(__VA_ARGS__)
#define vredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredsum_tum, __riscv_vredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmax_tum, __riscv_vredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmin_tum, __riscv_vredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vredand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredand_tum, __riscv_vredand_tu, 3, 2, 1)(__VA_ARGS__)
#define vredor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredor_tum, __riscv_vredor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredxor_tum, __riscv_vredxor_tu, 3, 2, 1)(__VA_ARGS__)
#define vredmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmaxu_tum, __riscv_vredmaxu_tu, 3, 2, 1)(__VA_ARGS__)
#define vredminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredminu_tum, __riscv_vredminu_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsum_tum, __riscv_vwredsum_tu, 3, 2, 1)(__VA_ARGS__)
#define vwredsumu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsumu_tum, __riscv_vwredsumu_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredosum_tum, __riscv_vfredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredusum_tum, __riscv_vfredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmax_tum, __riscv_vfredmax_tu, 3, 2, 1)(__VA_ARGS__)
#define vfredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmin_tum, __riscv_vfredmin_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredosum_tum, __riscv_vfwredosum_tu, 3, 2, 1)(__VA_ARGS__)
#define vfwredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredusum_tum, __riscv_vfwredusum_tu, 3, 2, 1)(__VA_ARGS__)
#define vsm(...) __riscv_vsm(__VA_ARGS__)
#define vmand(...) __riscv_vmand(__VA_ARGS__)
#define vmnand(...) __riscv_vmnand(__VA_ARGS__)
#define vmandn(...) __riscv_vmandn(__VA_ARGS__)
#define vmxor(...) __riscv_vmxor(__VA_ARGS__)
#define vmor(...) __riscv_vmor(__VA_ARGS__)
#define vmnor(...) __riscv_vmnor(__VA_ARGS__)
#define vmorn(...) __riscv_vmorn(__VA_ARGS__)
#define vmxnor(...) __riscv_vmxnor(__VA_ARGS__)
#define vmmv(...) __riscv_vmmv(__VA_ARGS__)
#define vmnot(...) __riscv_vmnot(__VA_ARGS__)
#define vcpop(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vcpop, __riscv_vcpop, 1)(__VA_ARGS__)
#define vfirst(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vfirst, __riscv_vfirst, 1)(__VA_ARGS__)
#define vmsbf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsbf_mu, 3, __riscv_vmsbf, 1)(__VA_ARGS__)
#define vmsif(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsif_mu, 3, __riscv_vmsif, 1)(__VA_ARGS__)
#define vmsof(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsof_mu, 3, __riscv_vmsof, 1)(__VA_ARGS__)
#define vfmv_f(...) __riscv_vfmv_f(__VA_ARGS__)
#define vfmv_s(...) __riscv_vfmv_s_tu(__VA_ARGS__)
#define vmv_x(...) __riscv_vmv_x(__VA_ARGS__)
#define vmv_s(...) __riscv_vmv_s_tu(__VA_ARGS__)
#define vslideup(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslideup_tumu, __riscv_vslideup_tu, 3, 2, 1)(__VA_ARGS__)
#define vslidedown(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslidedown_tumu, __riscv_vslidedown_tu, 3, 2, 1)(__VA_ARGS__)
#define vfslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1up_tumu, 4, __riscv_vfslide1up, 2, 1)(__VA_ARGS__)
#define vfslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1down_tumu, 4, __riscv_vfslide1down, 2, 1)(__VA_ARGS__)
#define vslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1up_tumu, 4, __riscv_vslide1up, 2, 1)(__VA_ARGS__)
#define vslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1down_tumu, 4, __riscv_vslide1down, 2, 1)(__VA_ARGS__)
#define vrgather(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgather_tumu, 4, __riscv_vrgather, 2, 1)(__VA_ARGS__)
#define vrgatherei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgatherei16_tumu, 4, __riscv_vrgatherei16, 2, 1)(__VA_ARGS__)
#define vreinterpret_u8mf8(...) __riscv_vreinterpret_u8mf8(__VA_ARGS__)
#define vreinterpret_u8mf4(...) __riscv_vreinterpret_u8mf4(__VA_ARGS__)
#define vreinterpret_u8mf2(...) __riscv_vreinterpret_u8mf2(__VA_ARGS__)
#define vreinterpret_u8m1(...) __riscv_vreinterpret_u8m1(__VA_ARGS__)
#define vreinterpret_u8m2(...) __riscv_vreinterpret_u8m2(__VA_ARGS__)
#define vreinterpret_u8m4(...) __riscv_vreinterpret_u8m4(__VA_ARGS__)
#define vreinterpret_u8m8(...) __riscv_vreinterpret_u8m8(__VA_ARGS__)
#define vreinterpret_i8mf8(...) __riscv_vreinterpret_i8mf8(__VA_ARGS__)
#define vreinterpret_i8mf4(...) __riscv_vreinterpret_i8mf4(__VA_ARGS__)
#define vreinterpret_i8mf2(...) __riscv_vreinterpret_i8mf2(__VA_ARGS__)
#define vreinterpret_i8m1(...) __riscv_vreinterpret_i8m1(__VA_ARGS__)
#define vreinterpret_i8m2(...) __riscv_vreinterpret_i8m2(__VA_ARGS__)
#define vreinterpret_i8m4(...) __riscv_vreinterpret_i8m4(__VA_ARGS__)
#define vreinterpret_i8m8(...) __riscv_vreinterpret_i8m8(__VA_ARGS__)
#define vreinterpret_f16mf4(...) __riscv_vreinterpret_f16mf4(__VA_ARGS__)
#define vreinterpret_f16mf2(...) __riscv_vreinterpret_f16mf2(__VA_ARGS__)
#define vreinterpret_f16m1(...) __riscv_vreinterpret_f16m1(__VA_ARGS__)
#define vreinterpret_f16m2(...) __riscv_vreinterpret_f16m2(__VA_ARGS__)
#define vreinterpret_f16m4(...) __riscv_vreinterpret_f16m4(__VA_ARGS__)
#define vreinterpret_f16m8(...) __riscv_vreinterpret_f16m8(__VA_ARGS__)
#define vreinterpret_u16mf4(...) __riscv_vreinterpret_u16mf4(__VA_ARGS__)
#define vreinterpret_u16mf2(...) __riscv_vreinterpret_u16mf2(__VA_ARGS__)
#define vreinterpret_u16m1(...) __riscv_vreinterpret_u16m1(__VA_ARGS__)
#define vreinterpret_u16m2(...) __riscv_vreinterpret_u16m2(__VA_ARGS__)
#define vreinterpret_u16m4(...) __riscv_vreinterpret_u16m4(__VA_ARGS__)
#define vreinterpret_u16m8(...) __riscv_vreinterpret_u16m8(__VA_ARGS__)
#define vreinterpret_i16mf4(...) __riscv_vreinterpret_i16mf4(__VA_ARGS__)
#define vreinterpret_i16mf2(...) __riscv_vreinterpret_i16mf2(__VA_ARGS__)
#define vreinterpret_i16m1(...) __riscv_vreinterpret_i16m1(__VA_ARGS__)
#define vreinterpret_i16m2(...) __riscv_vreinterpret_i16m2(__VA_ARGS__)
#define vreinterpret_i16m4(...) __riscv_vreinterpret_i16m4(__VA_ARGS__)
#define vreinterpret_i16m8(...) __riscv_vreinterpret_i16m8(__VA_ARGS__)
#define vreinterpret_f32mf2(...) __riscv_vreinterpret_f32mf2(__VA_ARGS__)
#define vreinterpret_f32m1(...) __riscv_vreinterpret_f32m1(__VA_ARGS__)
#define vreinterpret_f32m2(...) __riscv_vreinterpret_f32m2(__VA_ARGS__)
#define vreinterpret_f32m4(...) __riscv_vreinterpret_f32m4(__VA_ARGS__)
#define vreinterpret_f32m8(...) __riscv_vreinterpret_f32m8(__VA_ARGS__)
#define vreinterpret_u32mf2(...) __riscv_vreinterpret_u32mf2(__VA_ARGS__)
#define vreinterpret_u32m1(...) __riscv_vreinterpret_u32m1(__VA_ARGS__)
#define vreinterpret_u32m2(...) __riscv_vreinterpret_u32m2(__VA_ARGS__)
#define vreinterpret_u32m4(...) __riscv_vreinterpret_u32m4(__VA_ARGS__)
#define vreinterpret_u32m8(...) __riscv_vreinterpret_u32m8(__VA_ARGS__)
#define vreinterpret_i32mf2(...) __riscv_vreinterpret_i32mf2(__VA_ARGS__)
#define vreinterpret_i32m1(...) __riscv_vreinterpret_i32m1(__VA_ARGS__)
#define vreinterpret_i32m2(...) __riscv_vreinterpret_i32m2(__VA_ARGS__)
#define vreinterpret_i32m4(...) __riscv_vreinterpret_i32m4(__VA_ARGS__)
#define vreinterpret_i32m8(...) __riscv_vreinterpret_i32m8(__VA_ARGS__)
#define vreinterpret_f64m1(...) __riscv_vreinterpret_f64m1(__VA_ARGS__)
#define vreinterpret_f64m2(...) __riscv_vreinterpret_f64m2(__VA_ARGS__)
#define vreinterpret_f64m4(...) __riscv_vreinterpret_f64m4(__VA_ARGS__)
#define vreinterpret_f64m8(...) __riscv_vreinterpret_f64m8(__VA_ARGS__)
#define vreinterpret_u64m1(...) __riscv_vreinterpret_u64m1(__VA_ARGS__)
#define vreinterpret_u64m2(...) __riscv_vreinterpret_u64m2(__VA_ARGS__)
#define vreinterpret_u64m4(...) __riscv_vreinterpret_u64m4(__VA_ARGS__)
#define vreinterpret_u64m8(...) __riscv_vreinterpret_u64m8(__VA_ARGS__)
#define vreinterpret_i64m1(...) __riscv_vreinterpret_i64m1(__VA_ARGS__)
#define vreinterpret_i64m2(...) __riscv_vreinterpret_i64m2(__VA_ARGS__)
#define vreinterpret_i64m4(...) __riscv_vreinterpret_i64m4(__VA_ARGS__)
#define vreinterpret_i64m8(...) __riscv_vreinterpret_i64m8(__VA_ARGS__)
#define vlmul_ext_f16mf2(...) __riscv_vlmul_ext_f16mf2(__VA_ARGS__)
#define vlmul_ext_f16m1(...) __riscv_vlmul_ext_f16m1(__VA_ARGS__)
#define vlmul_ext_f16m2(...) __riscv_vlmul_ext_f16m2(__VA_ARGS__)
#define vlmul_ext_f16m4(...) __riscv_vlmul_ext_f16m4(__VA_ARGS__)
#define vlmul_ext_f16m8(...) __riscv_vlmul_ext_f16m8(__VA_ARGS__)
#define vlmul_ext_f32m1(...) __riscv_vlmul_ext_f32m1(__VA_ARGS__)
#define vlmul_ext_f32m2(...) __riscv_vlmul_ext_f32m2(__VA_ARGS__)
#define vlmul_ext_f32m4(...) __riscv_vlmul_ext_f32m4(__VA_ARGS__)
#define vlmul_ext_f32m8(...) __riscv_vlmul_ext_f32m8(__VA_ARGS__)
#define vlmul_ext_f64m2(...) __riscv_vlmul_ext_f64m2(__VA_ARGS__)
#define vlmul_ext_f64m4(...) __riscv_vlmul_ext_f64m4(__VA_ARGS__)
#define vlmul_ext_f64m8(...) __riscv_vlmul_ext_f64m8(__VA_ARGS__)
#define vlmul_ext_i8mf4(...) __riscv_vlmul_ext_i8mf4(__VA_ARGS__)
#define vlmul_ext_i8mf2(...) __riscv_vlmul_ext_i8mf2(__VA_ARGS__)
#define vlmul_ext_i8m1(...) __riscv_vlmul_ext_i8m1(__VA_ARGS__)
#define vlmul_ext_i8m2(...) __riscv_vlmul_ext_i8m2(__VA_ARGS__)
#define vlmul_ext_i8m4(...) __riscv_vlmul_ext_i8m4(__VA_ARGS__)
#define vlmul_ext_i8m8(...) __riscv_vlmul_ext_i8m8(__VA_ARGS__)
#define vlmul_ext_i16mf2(...) __riscv_vlmul_ext_i16mf2(__VA_ARGS__)
#define vlmul_ext_i16m1(...) __riscv_vlmul_ext_i16m1(__VA_ARGS__)
#define vlmul_ext_i16m2(...) __riscv_vlmul_ext_i16m2(__VA_ARGS__)
#define vlmul_ext_i16m4(...) __riscv_vlmul_ext_i16m4(__VA_ARGS__)
#define vlmul_ext_i16m8(...) __riscv_vlmul_ext_i16m8(__VA_ARGS__)
#define vlmul_ext_i32m1(...) __riscv_vlmul_ext_i32m1(__VA_ARGS__)
#define vlmul_ext_i32m2(...) __riscv_vlmul_ext_i32m2(__VA_ARGS__)
#define vlmul_ext_i32m4(...) __riscv_vlmul_ext_i32m4(__VA_ARGS__)
#define vlmul_ext_i32m8(...) __riscv_vlmul_ext_i32m8(__VA_ARGS__)
#define vlmul_ext_i64m2(...) __riscv_vlmul_ext_i64m2(__VA_ARGS__)
#define vlmul_ext_i64m4(...) __riscv_vlmul_ext_i64m4(__VA_ARGS__)
#define vlmul_ext_i64m8(...) __riscv_vlmul_ext_i64m8(__VA_ARGS__)
#define vlmul_ext_u8mf4(...) __riscv_vlmul_ext_u8mf4(__VA_ARGS__)
#define vlmul_ext_u8mf2(...) __riscv_vlmul_ext_u8mf2(__VA_ARGS__)
#define vlmul_ext_u8m1(...) __riscv_vlmul_ext_u8m1(__VA_ARGS__)
#define vlmul_ext_u8m2(...) __riscv_vlmul_ext_u8m2(__VA_ARGS__)
#define vlmul_ext_u8m4(...) __riscv_vlmul_ext_u8m4(__VA_ARGS__)
#define vlmul_ext_u8m8(...) __riscv_vlmul_ext_u8m8(__VA_ARGS__)
#define vlmul_ext_u16mf2(...) __riscv_vlmul_ext_u16mf2(__VA_ARGS__)
#define vlmul_ext_u16m1(...) __riscv_vlmul_ext_u16m1(__VA_ARGS__)
#define vlmul_ext_u16m2(...) __riscv_vlmul_ext_u16m2(__VA_ARGS__)
#define vlmul_ext_u16m4(...) __riscv_vlmul_ext_u16m4(__VA_ARGS__)
#define vlmul_ext_u16m8(...) __riscv_vlmul_ext_u16m8(__VA_ARGS__)
#define vlmul_ext_u32m1(...) __riscv_vlmul_ext_u32m1(__VA_ARGS__)
#define vlmul_ext_u32m2(...) __riscv_vlmul_ext_u32m2(__VA_ARGS__)
#define vlmul_ext_u32m4(...) __riscv_vlmul_ext_u32m4(__VA_ARGS__)
#define vlmul_ext_u32m8(...) __riscv_vlmul_ext_u32m8(__VA_ARGS__)
#define vlmul_ext_u64m2(...) __riscv_vlmul_ext_u64m2(__VA_ARGS__)
#define vlmul_ext_u64m4(...) __riscv_vlmul_ext_u64m4(__VA_ARGS__)
#define vlmul_ext_u64m8(...) __riscv_vlmul_ext_u64m8(__VA_ARGS__)
#define vlmul_trunc_f16mf4(...) __riscv_vlmul_trunc_f16mf4(__VA_ARGS__)
#define vlmul_trunc_f16mf2(...) __riscv_vlmul_trunc_f16mf2(__VA_ARGS__)
#define vlmul_trunc_f16m1(...) __riscv_vlmul_trunc_f16m1(__VA_ARGS__)
#define vlmul_trunc_f16m2(...) __riscv_vlmul_trunc_f16m2(__VA_ARGS__)
#define vlmul_trunc_f16m4(...) __riscv_vlmul_trunc_f16m4(__VA_ARGS__)
#define vlmul_trunc_f32mf2(...) __riscv_vlmul_trunc_f32mf2(__VA_ARGS__)
#define vlmul_trunc_f32m1(...) __riscv_vlmul_trunc_f32m1(__VA_ARGS__)
#define vlmul_trunc_f32m2(...) __riscv_vlmul_trunc_f32m2(__VA_ARGS__)
#define vlmul_trunc_f32m4(...) __riscv_vlmul_trunc_f32m4(__VA_ARGS__)
#define vlmul_trunc_f64m1(...) __riscv_vlmul_trunc_f64m1(__VA_ARGS__)
#define vlmul_trunc_f64m2(...) __riscv_vlmul_trunc_f64m2(__VA_ARGS__)
#define vlmul_trunc_f64m4(...) __riscv_vlmul_trunc_f64m4(__VA_ARGS__)
#define vlmul_trunc_i8mf8(...) __riscv_vlmul_trunc_i8mf8(__VA_ARGS__)
#define vlmul_trunc_i8mf4(...) __riscv_vlmul_trunc_i8mf4(__VA_ARGS__)
#define vlmul_trunc_i8mf2(...) __riscv_vlmul_trunc_i8mf2(__VA_ARGS__)
#define vlmul_trunc_i8m1(...) __riscv_vlmul_trunc_i8m1(__VA_ARGS__)
#define vlmul_trunc_i8m2(...) __riscv_vlmul_trunc_i8m2(__VA_ARGS__)
#define vlmul_trunc_i8m4(...) __riscv_vlmul_trunc_i8m4(__VA_ARGS__)
#define vlmul_trunc_i16mf4(...) __riscv_vlmul_trunc_i16mf4(__VA_ARGS__)
#define vlmul_trunc_i16mf2(...) __riscv_vlmul_trunc_i16mf2(__VA_ARGS__)
#define vlmul_trunc_i16m1(...) __riscv_vlmul_trunc_i16m1(__VA_ARGS__)
#define vlmul_trunc_i16m2(...) __riscv_vlmul_trunc_i16m2(__VA_ARGS__)
#define vlmul_trunc_i16m4(...) __riscv_vlmul_trunc_i16m4(__VA_ARGS__)
#define vlmul_trunc_i32mf2(...) __riscv_vlmul_trunc_i32mf2(__VA_ARGS__)
#define vlmul_trunc_i32m1(...) __riscv_vlmul_trunc_i32m1(__VA_ARGS__)
#define vlmul_trunc_i32m2(...) __riscv_vlmul_trunc_i32m2(__VA_ARGS__)
#define vlmul_trunc_i32m4(...) __riscv_vlmul_trunc_i32m4(__VA_ARGS__)
#define vlmul_trunc_i64m1(...) __riscv_vlmul_trunc_i64m1(__VA_ARGS__)
#define vlmul_trunc_i64m2(...) __riscv_vlmul_trunc_i64m2(__VA_ARGS__)
#define vlmul_trunc_i64m4(...) __riscv_vlmul_trunc_i64m4(__VA_ARGS__)
#define vlmul_trunc_u8mf8(...) __riscv_vlmul_trunc_u8mf8(__VA_ARGS__)
#define vlmul_trunc_u8mf4(...) __riscv_vlmul_trunc_u8mf4(__VA_ARGS__)
#define vlmul_trunc_u8mf2(...) __riscv_vlmul_trunc_u8mf2(__VA_ARGS__)
#define vlmul_trunc_u8m1(...) __riscv_vlmul_trunc_u8m1(__VA_ARGS__)
#define vlmul_trunc_u8m2(...) __riscv_vlmul_trunc_u8m2(__VA_ARGS__)
#define vlmul_trunc_u8m4(...) __riscv_vlmul_trunc_u8m4(__VA_ARGS__)
#define vlmul_trunc_u16mf4(...) __riscv_vlmul_trunc_u16mf4(__VA_ARGS__)
#define vlmul_trunc_u16mf2(...) __riscv_vlmul_trunc_u16mf2(__VA_ARGS__)
#define vlmul_trunc_u16m1(...) __riscv_vlmul_trunc_u16m1(__VA_ARGS__)
#define vlmul_trunc_u16m2(...) __riscv_vlmul_trunc_u16m2(__VA_ARGS__)
#define vlmul_trunc_u16m4(...) __riscv_vlmul_trunc_u16m4(__VA_ARGS__)
#define vlmul_trunc_u32mf2(...) __riscv_vlmul_trunc_u32mf2(__VA_ARGS__)
#define vlmul_trunc_u32m1(...) __riscv_vlmul_trunc_u32m1(__VA_ARGS__)
#define vlmul_trunc_u32m2(...) __riscv_vlmul_trunc_u32m2(__VA_ARGS__)
#define vlmul_trunc_u32m4(...) __riscv_vlmul_trunc_u32m4(__VA_ARGS__)
#define vlmul_trunc_u64m1(...) __riscv_vlmul_trunc_u64m1(__VA_ARGS__)
#define vlmul_trunc_u64m2(...) __riscv_vlmul_trunc_u64m2(__VA_ARGS__)
#define vlmul_trunc_u64m4(...) __riscv_vlmul_trunc_u64m4(__VA_ARGS__)
#define vset(...) __riscv_vset(__VA_ARGS__)
#define vget_f16m1(...) __riscv_vget_f16m1(__VA_ARGS__)
#define vget_f16m2(...) __riscv_vget_f16m2(__VA_ARGS__)
#define vget_f16m4(...) __riscv_vget_f16m4(__VA_ARGS__)
#define vget_f32m1(...) __riscv_vget_f32m1(__VA_ARGS__)
#define vget_f32m2(...) __riscv_vget_f32m2(__VA_ARGS__)
#define vget_f32m4(...) __riscv_vget_f32m4(__VA_ARGS__)
#define vget_f64m1(...) __riscv_vget_f64m1(__VA_ARGS__)
#define vget_f64m2(...) __riscv_vget_f64m2(__VA_ARGS__)
#define vget_f64m4(...) __riscv_vget_f64m4(__VA_ARGS__)
#define vget_i8m1(...) __riscv_vget_i8m1(__VA_ARGS__)
#define vget_i8m2(...) __riscv_vget_i8m2(__VA_ARGS__)
#define vget_i8m4(...) __riscv_vget_i8m4(__VA_ARGS__)
#define vget_i16m1(...) __riscv_vget_i16m1(__VA_ARGS__)
#define vget_i16m2(...) __riscv_vget_i16m2(__VA_ARGS__)
#define vget_i16m4(...) __riscv_vget_i16m4(__VA_ARGS__)
#define vget_i32m1(...) __riscv_vget_i32m1(__VA_ARGS__)
#define vget_i32m2(...) __riscv_vget_i32m2(__VA_ARGS__)
#define vget_i32m4(...) __riscv_vget_i32m4(__VA_ARGS__)
#define vget_i64m1(...) __riscv_vget_i64m1(__VA_ARGS__)
#define vget_i64m2(...) __riscv_vget_i64m2(__VA_ARGS__)
#define vget_i64m4(...) __riscv_vget_i64m4(__VA_ARGS__)
#define vget_u8m1(...) __riscv_vget_u8m1(__VA_ARGS__)
#define vget_u8m2(...) __riscv_vget_u8m2(__VA_ARGS__)
#define vget_u8m4(...) __riscv_vget_u8m4(__VA_ARGS__)
#define vget_u16m1(...) __riscv_vget_u16m1(__VA_ARGS__)
#define vget_u16m2(...) __riscv_vget_u16m2(__VA_ARGS__)
#define vget_u16m4(...) __riscv_vget_u16m4(__VA_ARGS__)
#define vget_u32m1(...) __riscv_vget_u32m1(__VA_ARGS__)
#define vget_u32m2(...) __riscv_vget_u32m2(__VA_ARGS__)
#define vget_u32m4(...) __riscv_vget_u32m4(__VA_ARGS__)
#define vget_u64m1(...) __riscv_vget_u64m1(__VA_ARGS__)
#define vget_u64m2(...) __riscv_vget_u64m2(__VA_ARGS__)
#define vget_u64m4(...) __riscv_vget_u64m4(__VA_ARGS__)
#define vle16(...) __riscv_vle16_tumu(__VA_ARGS__)
#define vle32(...) __riscv_vle32_tumu(__VA_ARGS__)
#define vle64(...) __riscv_vle64_tumu(__VA_ARGS__)
#define vle8(...) __riscv_vle8_tumu(__VA_ARGS__)
#define vlse16(...) __riscv_vlse16_tumu(__VA_ARGS__)
#define vlse32(...) __riscv_vlse32_tumu(__VA_ARGS__)
#define vlse64(...) __riscv_vlse64_tumu(__VA_ARGS__)
#define vlse8(...) __riscv_vlse8_tumu(__VA_ARGS__)
#define vle16ff(...) __riscv_vle16ff_tumu(__VA_ARGS__)
#define vle32ff(...) __riscv_vle32ff_tumu(__VA_ARGS__)
#define vle64ff(...) __riscv_vle64ff_tumu(__VA_ARGS__)
#define vle8ff(...) __riscv_vle8ff_tumu(__VA_ARGS__)
#define vlseg2e16(...) __riscv_vlseg2e16_tumu(__VA_ARGS__)
#define vlseg3e16(...) __riscv_vlseg3e16_tumu(__VA_ARGS__)
#define vlseg4e16(...) __riscv_vlseg4e16_tumu(__VA_ARGS__)
#define vlseg5e16(...) __riscv_vlseg5e16_tumu(__VA_ARGS__)
#define vlseg6e16(...) __riscv_vlseg6e16_tumu(__VA_ARGS__)
#define vlseg7e16(...) __riscv_vlseg7e16_tumu(__VA_ARGS__)
#define vlseg8e16(...) __riscv_vlseg8e16_tumu(__VA_ARGS__)
#define vlseg2e32(...) __riscv_vlseg2e32_tumu(__VA_ARGS__)
#define vlseg3e32(...) __riscv_vlseg3e32_tumu(__VA_ARGS__)
#define vlseg4e32(...) __riscv_vlseg4e32_tumu(__VA_ARGS__)
#define vlseg5e32(...) __riscv_vlseg5e32_tumu(__VA_ARGS__)
#define vlseg6e32(...) __riscv_vlseg6e32_tumu(__VA_ARGS__)
#define vlseg7e32(...) __riscv_vlseg7e32_tumu(__VA_ARGS__)
#define vlseg8e32(...) __riscv_vlseg8e32_tumu(__VA_ARGS__)
#define vlseg2e64(...) __riscv_vlseg2e64_tumu(__VA_ARGS__)
#define vlseg3e64(...) __riscv_vlseg3e64_tumu(__VA_ARGS__)
#define vlseg4e64(...) __riscv_vlseg4e64_tumu(__VA_ARGS__)
#define vlseg5e64(...) __riscv_vlseg5e64_tumu(__VA_ARGS__)
#define vlseg6e64(...) __riscv_vlseg6e64_tumu(__VA_ARGS__)
#define vlseg7e64(...) __riscv_vlseg7e64_tumu(__VA_ARGS__)
#define vlseg8e64(...) __riscv_vlseg8e64_tumu(__VA_ARGS__)
#define vlseg2e16ff(...) __riscv_vlseg2e16ff_tumu(__VA_ARGS__)
#define vlseg3e16ff(...) __riscv_vlseg3e16ff_tumu(__VA_ARGS__)
#define vlseg4e16ff(...) __riscv_vlseg4e16ff_tumu(__VA_ARGS__)
#define vlseg5e16ff(...) __riscv_vlseg5e16ff_tumu(__VA_ARGS__)
#define vlseg6e16ff(...) __riscv_vlseg6e16ff_tumu(__VA_ARGS__)
#define vlseg7e16ff(...) __riscv_vlseg7e16ff_tumu(__VA_ARGS__)
#define vlseg8e16ff(...) __riscv_vlseg8e16ff_tumu(__VA_ARGS__)
#define vlseg2e32ff(...) __riscv_vlseg2e32ff_tumu(__VA_ARGS__)
#define vlseg3e32ff(...) __riscv_vlseg3e32ff_tumu(__VA_ARGS__)
#define vlseg4e32ff(...) __riscv_vlseg4e32ff_tumu(__VA_ARGS__)
#define vlseg5e32ff(...) __riscv_vlseg5e32ff_tumu(__VA_ARGS__)
#define vlseg6e32ff(...) __riscv_vlseg6e32ff_tumu(__VA_ARGS__)
#define vlseg7e32ff(...) __riscv_vlseg7e32ff_tumu(__VA_ARGS__)
#define vlseg8e32ff(...) __riscv_vlseg8e32ff_tumu(__VA_ARGS__)
#define vlseg2e64ff(...) __riscv_vlseg2e64ff_tumu(__VA_ARGS__)
#define vlseg3e64ff(...) __riscv_vlseg3e64ff_tumu(__VA_ARGS__)
#define vlseg4e64ff(...) __riscv_vlseg4e64ff_tumu(__VA_ARGS__)
#define vlseg5e64ff(...) __riscv_vlseg5e64ff_tumu(__VA_ARGS__)
#define vlseg6e64ff(...) __riscv_vlseg6e64ff_tumu(__VA_ARGS__)
#define vlseg7e64ff(...) __riscv_vlseg7e64ff_tumu(__VA_ARGS__)
#define vlseg8e64ff(...) __riscv_vlseg8e64ff_tumu(__VA_ARGS__)
#define vlseg2e8(...) __riscv_vlseg2e8_tumu(__VA_ARGS__)
#define vlseg3e8(...) __riscv_vlseg3e8_tumu(__VA_ARGS__)
#define vlseg4e8(...) __riscv_vlseg4e8_tumu(__VA_ARGS__)
#define vlseg5e8(...) __riscv_vlseg5e8_tumu(__VA_ARGS__)
#define vlseg6e8(...) __riscv_vlseg6e8_tumu(__VA_ARGS__)
#define vlseg7e8(...) __riscv_vlseg7e8_tumu(__VA_ARGS__)
#define vlseg8e8(...) __riscv_vlseg8e8_tumu(__VA_ARGS__)
#define vlseg2e8ff(...) __riscv_vlseg2e8ff_tumu(__VA_ARGS__)
#define vlseg3e8ff(...) __riscv_vlseg3e8ff_tumu(__VA_ARGS__)
#define vlseg4e8ff(...) __riscv_vlseg4e8ff_tumu(__VA_ARGS__)
#define vlseg5e8ff(...) __riscv_vlseg5e8ff_tumu(__VA_ARGS__)
#define vlseg6e8ff(...) __riscv_vlseg6e8ff_tumu(__VA_ARGS__)
#define vlseg7e8ff(...) __riscv_vlseg7e8ff_tumu(__VA_ARGS__)
#define vlseg8e8ff(...) __riscv_vlseg8e8ff_tumu(__VA_ARGS__)
#define vlsseg2e16(...) __riscv_vlsseg2e16_tumu(__VA_ARGS__)
#define vlsseg3e16(...) __riscv_vlsseg3e16_tumu(__VA_ARGS__)
#define vlsseg4e16(...) __riscv_vlsseg4e16_tumu(__VA_ARGS__)
#define vlsseg5e16(...) __riscv_vlsseg5e16_tumu(__VA_ARGS__)
#define vlsseg6e16(...) __riscv_vlsseg6e16_tumu(__VA_ARGS__)
#define vlsseg7e16(...) __riscv_vlsseg7e16_tumu(__VA_ARGS__)
#define vlsseg8e16(...) __riscv_vlsseg8e16_tumu(__VA_ARGS__)
#define vlsseg2e32(...) __riscv_vlsseg2e32_tumu(__VA_ARGS__)
#define vlsseg3e32(...) __riscv_vlsseg3e32_tumu(__VA_ARGS__)
#define vlsseg4e32(...) __riscv_vlsseg4e32_tumu(__VA_ARGS__)
#define vlsseg5e32(...) __riscv_vlsseg5e32_tumu(__VA_ARGS__)
#define vlsseg6e32(...) __riscv_vlsseg6e32_tumu(__VA_ARGS__)
#define vlsseg7e32(...) __riscv_vlsseg7e32_tumu(__VA_ARGS__)
#define vlsseg8e32(...) __riscv_vlsseg8e32_tumu(__VA_ARGS__)
#define vlsseg2e64(...) __riscv_vlsseg2e64_tumu(__VA_ARGS__)
#define vlsseg3e64(...) __riscv_vlsseg3e64_tumu(__VA_ARGS__)
#define vlsseg4e64(...) __riscv_vlsseg4e64_tumu(__VA_ARGS__)
#define vlsseg5e64(...) __riscv_vlsseg5e64_tumu(__VA_ARGS__)
#define vlsseg6e64(...) __riscv_vlsseg6e64_tumu(__VA_ARGS__)
#define vlsseg7e64(...) __riscv_vlsseg7e64_tumu(__VA_ARGS__)
#define vlsseg8e64(...) __riscv_vlsseg8e64_tumu(__VA_ARGS__)
#define vlsseg2e8(...) __riscv_vlsseg2e8_tumu(__VA_ARGS__)
#define vlsseg3e8(...) __riscv_vlsseg3e8_tumu(__VA_ARGS__)
#define vlsseg4e8(...) __riscv_vlsseg4e8_tumu(__VA_ARGS__)
#define vlsseg5e8(...) __riscv_vlsseg5e8_tumu(__VA_ARGS__)
#define vlsseg6e8(...) __riscv_vlsseg6e8_tumu(__VA_ARGS__)
#define vlsseg7e8(...) __riscv_vlsseg7e8_tumu(__VA_ARGS__)
#define vlsseg8e8(...) __riscv_vlsseg8e8_tumu(__VA_ARGS__)
#define viota(...) __riscv_viota_tumu(__VA_ARGS__)
#define vid(...) __riscv_vid_tumu(__VA_ARGS__)
#endif

View File

@ -1,33 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// 0.11 -> 0.12 compatibility
#ifndef _RVV_IMPLICIT_VXRM
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
#endif
// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg)
// masked
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
// unmasked
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)

View File

@ -1,213 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
#define OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
// This file requires VTraits to be defined for vector types
#define OPENCV_HAL_IMPL_RVV_FUN_AND(REG, SUF) \
inline static REG vand(const REG & op1, const REG & op2, size_t vl) \
{ \
return vand_vv_##SUF(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_AND(vint8m1_t, i8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint8m1_t, u8m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint16m1_t, i16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint16m1_t, u16m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint32m1_t, i32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint32m1_t, u32m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vint64m1_t, i64m1)
OPENCV_HAL_IMPL_RVV_FUN_AND(vuint64m1_t, u64m1)
#define OPENCV_HAL_IMPL_RVV_FUN_LOXEI(REG, SUF, INDX, ISUF) \
inline static REG vloxe##ISUF(const VTraits<REG>::lane_type *base, INDX bindex, size_t vl) \
{ \
return vloxe##ISUF##_v_##SUF(base, bindex, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m4_t, i8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m8_t, i8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint16m1_t, i16m1, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m1_t, i32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m2_t, i32m2, vuint32m2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m4_t, i32m4, vuint32m4_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m8_t, i32m8, vuint32m8_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint64m1_t, i64m1, vuint32mf2_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m1_t, u8m1, vuint8m1_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_MUL(REG, SUF) \
inline static REG##m1_t vmul(const REG##m1_t & op1, const REG##m1_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m1_t vmul(const REG##m1_t & op1, VTraits<REG##m1_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m1(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, const REG##m2_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m2_t vmul(const REG##m2_t & op1, VTraits<REG##m2_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m2(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, const REG##m4_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m4_t vmul(const REG##m4_t & op1, VTraits<REG##m4_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m4(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, const REG##m8_t & op2, size_t vl) \
{ \
return vmul_vv_##SUF##m8(op1, op2, vl); \
} \
inline static REG##m8_t vmul(const REG##m8_t & op1, VTraits<REG##m8_t>::lane_type op2, size_t vl) \
{ \
return vmul_vx_##SUF##m8(op1, op2, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint32, u32)
#define OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(REG1, SUF1, REG2, SUF2) \
inline static REG1##m1_t vreinterpret_##SUF1##m1(const REG2##m1_t & src) \
{\
return vreinterpret_v_##SUF2##m1_##SUF1##m1(src); \
} \
inline static REG1##m2_t vreinterpret_##SUF1##m2(const REG2##m2_t & src) \
{\
return vreinterpret_v_##SUF2##m2_##SUF1##m2(src); \
} \
inline static REG1##m4_t vreinterpret_##SUF1##m4(const REG2##m4_t & src) \
{\
return vreinterpret_v_##SUF2##m4_##SUF1##m4(src); \
} \
inline static REG1##m8_t vreinterpret_##SUF1##m8(const REG2##m8_t & src) \
{\
return vreinterpret_v_##SUF2##m8_##SUF1##m8(src); \
}
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint8, i8, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint16, i16, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vfloat32, f32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vint8, i8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vint16, i16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vint32, i32)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint64, u64)
#define OPENCV_HAL_IMPL_RVV_FUN_STORE(REG, SUF, SZ) \
inline static void vse##SZ(VTraits<REG>::lane_type *base, REG value, size_t vl) \
{ \
return vse##SZ##_v_##SUF##m1(base, value, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint8, u8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int8, i8, 8)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint16, u16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int16, i16, 16)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint32, u32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int32, i32, 32)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint64, u64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int64, i64, 64)
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float32, f32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float64, f64, 64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(REG, SUF) \
inline static VTraits<REG>::lane_type vmv_x(const REG & reg) \
{\
return vmv_x_s_##SUF##m1_##SUF(reg); \
}
#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(REG, SUF) \
inline static VTraits<REG>::lane_type vfmv_f(const REG & reg) \
{\
return vfmv_f_s_##SUF##m1_##SUF(reg); \
}
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int64, i64)
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float64, f64)
#endif
#define OPENCV_HAL_IMPL_RVV_FUN_SLIDE(REG, SUF) \
inline static REG vslidedown(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslidedown_vx_##SUF##m1(dst, src, offset, vl); \
} \
inline static REG vslideup(const REG & dst, const REG & src, size_t offset, size_t vl) \
{ \
return vslideup_vx_##SUF##m1(dst, src, offset, vl); \
}
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int8, i8)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int16, i16)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int32, i32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float32, f32)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float64, f64)
#endif
inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t vl)
{
return vmul_vx_u32mf2(op1, op2, vl);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
{
return vreinterpret_v_i32mf2_u32mf2(val);
}
inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
{
return vreinterpret_v_u16mf2_u32mf2(val);
}
#endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP

File diff suppressed because it is too large Load Diff

View File

@ -304,8 +304,8 @@ public:
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
if( useRVV)
#if CV_TRY_RVV && CV_RVV
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif

View File

@ -1257,7 +1257,7 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
}
#endif // CV_LASX
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV
static const size_t __cv_rvv_e8m1_max = __riscv_vsetvlmax_e8m1();
static const size_t __cv_rvv_e16m1_max = __riscv_vsetvlmax_e16m1();

View File

@ -1667,7 +1667,7 @@ public:
opt_AVX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
#if CV_TRY_RVV
#if CV_TRY_RVV && CV_RVV
if( useRVV ) {
opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
}

View File

@ -119,7 +119,7 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
else
#endif
#if CV_TRY_RVV
#if CV_TRY_RVV && CV_RVV
if(canRunOpt && conv->useRVV)
opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);

View File

@ -264,48 +264,48 @@ void fastDepthwiseConv( const float* wptr,
if( stride_w == 1 )
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m8(avl);
vl = __riscv_vsetvl_e32m8(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m8_t vout0 = vfmacc_vf_f32m8(vfmv_v_f_f32m8(bias, vl), w00, vle32_v_f32m8(imgptr0 + in_j, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w01, vle32_v_f32m8(imgptr0 + in_j + dilation_w, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w02, vle32_v_f32m8(imgptr0 + in_j + dilation_w*2, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w10, vle32_v_f32m8(imgptr1 + in_j, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w11, vle32_v_f32m8(imgptr1 + in_j + dilation_w, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w12, vle32_v_f32m8(imgptr1 + in_j + dilation_w*2, vl),vl);
vout0 = vfmacc_vf_f32m8(vout0, w20, vle32_v_f32m8(imgptr2 + in_j, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w21, vle32_v_f32m8(imgptr2 + in_j + dilation_w, vl), vl);
vout0 = vfmacc_vf_f32m8(vout0, w22, vle32_v_f32m8(imgptr2 + in_j + dilation_w*2, vl), vl);
vfloat32m8_t vout0 = __riscv_vfmacc_vf_f32m8(__riscv_vfmv_v_f_f32m8(bias, vl), w00, __riscv_vle32_v_f32m8(imgptr0 + in_j, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w01, __riscv_vle32_v_f32m8(imgptr0 + in_j + dilation_w, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w02, __riscv_vle32_v_f32m8(imgptr0 + in_j + dilation_w*2, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w10, __riscv_vle32_v_f32m8(imgptr1 + in_j, vl),vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w11, __riscv_vle32_v_f32m8(imgptr1 + in_j + dilation_w, vl),vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w12, __riscv_vle32_v_f32m8(imgptr1 + in_j + dilation_w*2, vl),vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w20, __riscv_vle32_v_f32m8(imgptr2 + in_j, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w21, __riscv_vle32_v_f32m8(imgptr2 + in_j + dilation_w, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m8(vout0, w22, __riscv_vle32_v_f32m8(imgptr2 + in_j + dilation_w*2, vl), vl);
if (relu)
{
vbool4_t m = vmfgt_vf_f32m8_b4(vout0, 0, vl);
vout0 = vmerge_vvm_f32m8(m, vfmul_vf_f32m8(vout0, relu_coeff, vl), vout0, vl);
vbool4_t m = __riscv_vmfgt_vf_f32m8_b4(vout0, 0, vl);
vout0 = __riscv_vmerge_vvm_f32m8(__riscv_vfmul_vf_f32m8(vout0, relu_coeff, vl), vout0, m, vl);
}
vse32_v_f32m8(outptr + out_j, vout0, vl);
__riscv_vse32_v_f32m8(outptr + out_j, vout0, vl);
}
else //stride_w == 2 && dilation_w == 1
for( ; out_j < outW1; out_j += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
vl = __riscv_vsetvl_e32m2(avl);
int in_j = out_j * stride_w - pad_l;
vfloat32m2_t vout0 = vfmacc_vf_f32m2(vfmv_v_f_f32m2(bias, vl), w00, vlse32_v_f32m2(imgptr0+in_j , 8, vl), vl);
vfloat32m2_t vout1 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+1, 8, vl), w01, vl);
vfloat32m2_t vout2 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+2, 8, vl), w02, vl);
vfloat32m2_t vout0 = __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(bias, vl), w00, __riscv_vlse32_v_f32m2(imgptr0+in_j , 8, vl), vl);
vfloat32m2_t vout1 = __riscv_vfmul_vf_f32m2(__riscv_vlse32_v_f32m2(imgptr0+in_j+1, 8, vl), w01, vl);
vfloat32m2_t vout2 = __riscv_vfmul_vf_f32m2(__riscv_vlse32_v_f32m2(imgptr0+in_j+2, 8, vl), w02, vl);
vout0 = vfmacc_vf_f32m2(vout0, w10, vlse32_v_f32m2(imgptr1+in_j , 8, vl), vl);
vout1 = vfmacc_vf_f32m2(vout1, w11, vlse32_v_f32m2(imgptr1+in_j+1, 8, vl), vl);
vout2 = vfmacc_vf_f32m2(vout2, w12, vlse32_v_f32m2(imgptr1+in_j+2, 8, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m2(vout0, w10, __riscv_vlse32_v_f32m2(imgptr1+in_j , 8, vl), vl);
vout1 = __riscv_vfmacc_vf_f32m2(vout1, w11, __riscv_vlse32_v_f32m2(imgptr1+in_j+1, 8, vl), vl);
vout2 = __riscv_vfmacc_vf_f32m2(vout2, w12, __riscv_vlse32_v_f32m2(imgptr1+in_j+2, 8, vl), vl);
vout0 = vfmacc_vf_f32m2(vout0, w20, vlse32_v_f32m2(imgptr2+in_j , 8, vl), vl);
vout1 = vfmacc_vf_f32m2(vout1, w21, vlse32_v_f32m2(imgptr2+in_j+1, 8, vl), vl);
vout2 = vfmacc_vf_f32m2(vout2, w22, vlse32_v_f32m2(imgptr2+in_j+2, 8, vl), vl);
vout0 = __riscv_vfmacc_vf_f32m2(vout0, w20, __riscv_vlse32_v_f32m2(imgptr2+in_j , 8, vl), vl);
vout1 = __riscv_vfmacc_vf_f32m2(vout1, w21, __riscv_vlse32_v_f32m2(imgptr2+in_j+1, 8, vl), vl);
vout2 = __riscv_vfmacc_vf_f32m2(vout2, w22, __riscv_vlse32_v_f32m2(imgptr2+in_j+2, 8, vl), vl);
vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
vout0 = __riscv_vfadd_vv_f32m2(__riscv_vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
if (relu)
{
vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
vbool16_t m = __riscv_vmfgt_vf_f32m2_b16(vout0, 0, vl);
vout0 = __riscv_vmerge_vvm_f32m2(__riscv_vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, m, vl);
}
vse32_v_f32m2(outptr + out_j, vout0, vl);
__riscv_vse32_v_f32m2(outptr + out_j, vout0, vl);
}
}

View File

@ -277,7 +277,7 @@ public:
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize_aligned);
else
#endif
#if CV_TRY_RVV
#if CV_TRY_RVV && CV_RVV
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else

View File

@ -295,7 +295,7 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
int avl = nb, vl;
for(int n = 0; n < nb; n += vl, avl -= vl)
{
vl = vsetvl_e32m4(avl);
vl = __riscv_vsetvl_e32m4(avl);
for( int m = 0; m < ma; m += 7 )
{
const float* aptr0 = aptr + astep*m;
@ -314,13 +314,13 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
float* cptr5 = cptr + cstep*std::min(m+5, ma-1);
float* cptr6 = cptr + cstep*std::min(m+6, ma-1);
vfloat32m4_t d0 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d1 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d2 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d3 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d4 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d5 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d6 = vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d0 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d1 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d2 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d3 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d4 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d5 = __riscv_vfmv_v_f_f32m4(0, vl);
vfloat32m4_t d6 = __riscv_vfmv_v_f_f32m4(0, vl);
for( int k = 0; k < na; k++ )
{
@ -332,22 +332,22 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
float a5 = aptr5[k];
float a6 = aptr6[k];
vfloat32m4_t b = vle32_v_f32m4(bptr + k*bstep + n, vl);
d0 = vfmacc_vf_f32m4(d0, a0, b, vl);
d1 = vfmacc_vf_f32m4(d1, a1, b, vl);
d2 = vfmacc_vf_f32m4(d2, a2, b, vl);
d3 = vfmacc_vf_f32m4(d3, a3, b, vl);
d4 = vfmacc_vf_f32m4(d4, a4, b, vl);
d5 = vfmacc_vf_f32m4(d5, a5, b, vl);
d6 = vfmacc_vf_f32m4(d6, a6, b, vl);
vfloat32m4_t b = __riscv_vle32_v_f32m4(bptr + k*bstep + n, vl);
d0 = __riscv_vfmacc_vf_f32m4(d0, a0, b, vl);
d1 = __riscv_vfmacc_vf_f32m4(d1, a1, b, vl);
d2 = __riscv_vfmacc_vf_f32m4(d2, a2, b, vl);
d3 = __riscv_vfmacc_vf_f32m4(d3, a3, b, vl);
d4 = __riscv_vfmacc_vf_f32m4(d4, a4, b, vl);
d5 = __riscv_vfmacc_vf_f32m4(d5, a5, b, vl);
d6 = __riscv_vfmacc_vf_f32m4(d6, a6, b, vl);
}
vse32_v_f32m4(cptr0 + n, d0, vl);
vse32_v_f32m4(cptr1 + n, d1, vl);
vse32_v_f32m4(cptr2 + n, d2, vl);
vse32_v_f32m4(cptr3 + n, d3, vl);
vse32_v_f32m4(cptr4 + n, d4, vl);
vse32_v_f32m4(cptr5 + n, d5, vl);
vse32_v_f32m4(cptr6 + n, d6, vl);
__riscv_vse32_v_f32m4(cptr0 + n, d0, vl);
__riscv_vse32_v_f32m4(cptr1 + n, d1, vl);
__riscv_vse32_v_f32m4(cptr2 + n, d2, vl);
__riscv_vse32_v_f32m4(cptr3 + n, d3, vl);
__riscv_vse32_v_f32m4(cptr4 + n, d4, vl);
__riscv_vse32_v_f32m4(cptr5 + n, d5, vl);
__riscv_vse32_v_f32m4(cptr6 + n, d6, vl);
}
}
}
@ -356,112 +356,112 @@ void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
const int vlm2 = vsetvlmax_e32m2();
const int vlm2 = __riscv_vsetvlmax_e32m2();
int i = 0;
for( ; i <= nvecs - 15; i += 15 )
{
const float* wptr = weights + i*wstep;
vfloat32m2_t
vs0 = vfmv_v_f_f32m2(0, vlm2), vs1 = vfmv_v_f_f32m2(0, vlm2), vs2 = vfmv_v_f_f32m2(0, vlm2),
vs3 = vfmv_v_f_f32m2(0, vlm2), vs4 = vfmv_v_f_f32m2(0, vlm2), vs5 = vfmv_v_f_f32m2(0, vlm2),
vs6 = vfmv_v_f_f32m2(0, vlm2), vs7 = vfmv_v_f_f32m2(0, vlm2), vs8 = vfmv_v_f_f32m2(0, vlm2),
vs9 = vfmv_v_f_f32m2(0, vlm2), vs10 = vfmv_v_f_f32m2(0, vlm2), vs11 = vfmv_v_f_f32m2(0, vlm2),
vs12 = vfmv_v_f_f32m2(0, vlm2), vs13 = vfmv_v_f_f32m2(0, vlm2), vs14 = vfmv_v_f_f32m2(0, vlm2);
vs0 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs1 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs2 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs3 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs4 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs5 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs6 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs7 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs8 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs9 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs10 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs11 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs12 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs13 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs14 = __riscv_vfmv_v_f_f32m2(0, vlm2);
int avl = vecsize, vl;
for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
vfloat32m2_t v = vle32_v_f32m2(vec + k, vl);
vs0 = vfmacc_vv_f32m2(vs0, vle32_v_f32m2(wptr, vl), v, vl);
vs1 = vfmacc_vv_f32m2(vs1, vle32_v_f32m2(wptr + wstep, vl), v, vl);
vs2 = vfmacc_vv_f32m2(vs2, vle32_v_f32m2(wptr + wstep*2, vl), v, vl);
vs3 = vfmacc_vv_f32m2(vs3, vle32_v_f32m2(wptr + wstep*3, vl), v, vl);
vs4 = vfmacc_vv_f32m2(vs4, vle32_v_f32m2(wptr + wstep*4, vl), v, vl);
vs5 = vfmacc_vv_f32m2(vs5, vle32_v_f32m2(wptr + wstep*5, vl), v, vl);
vs6 = vfmacc_vv_f32m2(vs6, vle32_v_f32m2(wptr + wstep*6, vl), v, vl);
vs7 = vfmacc_vv_f32m2(vs7, vle32_v_f32m2(wptr + wstep*7, vl), v, vl);
vs8 = vfmacc_vv_f32m2(vs8, vle32_v_f32m2(wptr + wstep*8, vl), v, vl);
vs9 = vfmacc_vv_f32m2(vs9, vle32_v_f32m2(wptr + wstep*9, vl), v, vl);
vs10 = vfmacc_vv_f32m2(vs10, vle32_v_f32m2(wptr + wstep*10, vl), v, vl);
vs11 = vfmacc_vv_f32m2(vs11, vle32_v_f32m2(wptr + wstep*11, vl), v, vl);
vs12 = vfmacc_vv_f32m2(vs12, vle32_v_f32m2(wptr + wstep*12, vl), v, vl);
vs13 = vfmacc_vv_f32m2(vs13, vle32_v_f32m2(wptr + wstep*13, vl), v, vl);
vs14 = vfmacc_vv_f32m2(vs14, vle32_v_f32m2(wptr + wstep*14, vl), v, vl);
vl = __riscv_vsetvl_e32m2(avl);
vfloat32m2_t v = __riscv_vle32_v_f32m2(vec + k, vl);
vs0 = __riscv_vfmacc_vv_f32m2(vs0, __riscv_vle32_v_f32m2(wptr, vl), v, vl);
vs1 = __riscv_vfmacc_vv_f32m2(vs1, __riscv_vle32_v_f32m2(wptr + wstep, vl), v, vl);
vs2 = __riscv_vfmacc_vv_f32m2(vs2, __riscv_vle32_v_f32m2(wptr + wstep*2, vl), v, vl);
vs3 = __riscv_vfmacc_vv_f32m2(vs3, __riscv_vle32_v_f32m2(wptr + wstep*3, vl), v, vl);
vs4 = __riscv_vfmacc_vv_f32m2(vs4, __riscv_vle32_v_f32m2(wptr + wstep*4, vl), v, vl);
vs5 = __riscv_vfmacc_vv_f32m2(vs5, __riscv_vle32_v_f32m2(wptr + wstep*5, vl), v, vl);
vs6 = __riscv_vfmacc_vv_f32m2(vs6, __riscv_vle32_v_f32m2(wptr + wstep*6, vl), v, vl);
vs7 = __riscv_vfmacc_vv_f32m2(vs7, __riscv_vle32_v_f32m2(wptr + wstep*7, vl), v, vl);
vs8 = __riscv_vfmacc_vv_f32m2(vs8, __riscv_vle32_v_f32m2(wptr + wstep*8, vl), v, vl);
vs9 = __riscv_vfmacc_vv_f32m2(vs9, __riscv_vle32_v_f32m2(wptr + wstep*9, vl), v, vl);
vs10 = __riscv_vfmacc_vv_f32m2(vs10, __riscv_vle32_v_f32m2(wptr + wstep*10, vl), v, vl);
vs11 = __riscv_vfmacc_vv_f32m2(vs11, __riscv_vle32_v_f32m2(wptr + wstep*11, vl), v, vl);
vs12 = __riscv_vfmacc_vv_f32m2(vs12, __riscv_vle32_v_f32m2(wptr + wstep*12, vl), v, vl);
vs13 = __riscv_vfmacc_vv_f32m2(vs13, __riscv_vle32_v_f32m2(wptr + wstep*13, vl), v, vl);
vs14 = __riscv_vfmacc_vv_f32m2(vs14, __riscv_vle32_v_f32m2(wptr + wstep*14, vl), v, vl);
}
// Calculate the sum of each vector
float sum[15];
vfloat32m1_t zero = vfmv_v_f_f32m1(0, vlm2);
sum[0] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs0, zero, vlm2));
sum[1] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs1, zero, vlm2));
sum[2] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs2, zero, vlm2));
sum[3] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs3, zero, vlm2));
sum[4] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs4, zero, vlm2));
sum[5] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs5, zero, vlm2));
sum[6] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs6, zero, vlm2));
sum[7] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs7, zero, vlm2));
sum[8] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs8, zero, vlm2));
sum[9] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs9, zero, vlm2));
sum[10] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs10, zero, vlm2));
sum[11] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs11, zero, vlm2));
sum[12] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs12, zero, vlm2));
sum[13] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs13, zero, vlm2));
sum[14] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs14, zero, vlm2));
vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0, vlm2);
sum[0] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs0, zero, vlm2));
sum[1] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs1, zero, vlm2));
sum[2] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs2, zero, vlm2));
sum[3] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs3, zero, vlm2));
sum[4] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs4, zero, vlm2));
sum[5] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs5, zero, vlm2));
sum[6] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs6, zero, vlm2));
sum[7] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs7, zero, vlm2));
sum[8] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs8, zero, vlm2));
sum[9] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs9, zero, vlm2));
sum[10] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs10, zero, vlm2));
sum[11] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs11, zero, vlm2));
sum[12] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs12, zero, vlm2));
sum[13] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs13, zero, vlm2));
sum[14] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs14, zero, vlm2));
vfloat32m4_t s0 = vfadd_vv_f32m4(vle32_v_f32m4(sum, 15), vle32_v_f32m4(bias + i, 15), 15);
vse32_v_f32m4(dst + i, s0, 15);
vfloat32m4_t s0 = __riscv_vfadd_vv_f32m4(__riscv_vle32_v_f32m4(sum, 15), __riscv_vle32_v_f32m4(bias + i, 15), 15);
__riscv_vse32_v_f32m4(dst + i, s0, 15);
}
int unroll_tail = nvecs - i;
if (unroll_tail > 0)
{
const float* wptr = weights + i*wstep;
vfloat32m2_t
vs0 = vfmv_v_f_f32m2(0, vlm2), vs1 = vfmv_v_f_f32m2(0, vlm2), vs2 = vfmv_v_f_f32m2(0, vlm2),
vs3 = vfmv_v_f_f32m2(0, vlm2), vs4 = vfmv_v_f_f32m2(0, vlm2), vs5 = vfmv_v_f_f32m2(0, vlm2),
vs6 = vfmv_v_f_f32m2(0, vlm2), vs7 = vfmv_v_f_f32m2(0, vlm2), vs8 = vfmv_v_f_f32m2(0, vlm2),
vs9 = vfmv_v_f_f32m2(0, vlm2), vs10 = vfmv_v_f_f32m2(0, vlm2), vs11 = vfmv_v_f_f32m2(0, vlm2),
vs12 = vfmv_v_f_f32m2(0, vlm2), vs13 = vfmv_v_f_f32m2(0, vlm2);
vs0 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs1 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs2 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs3 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs4 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs5 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs6 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs7 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs8 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs9 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs10 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs11 = __riscv_vfmv_v_f_f32m2(0, vlm2),
vs12 = __riscv_vfmv_v_f_f32m2(0, vlm2), vs13 = __riscv_vfmv_v_f_f32m2(0, vlm2);
int avl = vecsize, vl;
for(int k = 0; k < vecsize; k += vl, wptr += vl, avl -= vl)
{
vl = vsetvl_e32m2(avl);
vfloat32m2_t v = vle32_v_f32m2(vec + k, vl);
vs0 = vfmacc_vv_f32m2(vs0, vle32_v_f32m2(wptr, vl), v, vl);
vs1 = vfmacc_vv_f32m2(vs1, vle32_v_f32m2(wptr + wstep*std::min(1, unroll_tail-1), vl), v, vl);
vs2 = vfmacc_vv_f32m2(vs2, vle32_v_f32m2(wptr + wstep*std::min(2, unroll_tail-1), vl), v, vl);
vs3 = vfmacc_vv_f32m2(vs3, vle32_v_f32m2(wptr + wstep*std::min(3, unroll_tail-1), vl), v, vl);
vs4 = vfmacc_vv_f32m2(vs4, vle32_v_f32m2(wptr + wstep*std::min(4, unroll_tail-1), vl), v, vl);
vs5 = vfmacc_vv_f32m2(vs5, vle32_v_f32m2(wptr + wstep*std::min(5, unroll_tail-1), vl), v, vl);
vs6 = vfmacc_vv_f32m2(vs6, vle32_v_f32m2(wptr + wstep*std::min(6, unroll_tail-1), vl), v, vl);
vs7 = vfmacc_vv_f32m2(vs7, vle32_v_f32m2(wptr + wstep*std::min(7, unroll_tail-1), vl), v, vl);
vs8 = vfmacc_vv_f32m2(vs8, vle32_v_f32m2(wptr + wstep*std::min(8, unroll_tail-1), vl), v, vl);
vs9 = vfmacc_vv_f32m2(vs9, vle32_v_f32m2(wptr + wstep*std::min(9, unroll_tail-1), vl), v, vl);
vs10 = vfmacc_vv_f32m2(vs10, vle32_v_f32m2(wptr + wstep*std::min(10, unroll_tail-1), vl), v, vl);
vs11 = vfmacc_vv_f32m2(vs11, vle32_v_f32m2(wptr + wstep*std::min(11, unroll_tail-1), vl), v, vl);
vs12 = vfmacc_vv_f32m2(vs12, vle32_v_f32m2(wptr + wstep*std::min(12, unroll_tail-1), vl), v, vl);
vs13 = vfmacc_vv_f32m2(vs13, vle32_v_f32m2(wptr + wstep*std::min(13, unroll_tail-1), vl), v, vl);
vl = __riscv_vsetvl_e32m2(avl);
vfloat32m2_t v = __riscv_vle32_v_f32m2(vec + k, vl);
vs0 = __riscv_vfmacc_vv_f32m2(vs0, __riscv_vle32_v_f32m2(wptr, vl), v, vl);
vs1 = __riscv_vfmacc_vv_f32m2(vs1, __riscv_vle32_v_f32m2(wptr + wstep*std::min(1, unroll_tail-1), vl), v, vl);
vs2 = __riscv_vfmacc_vv_f32m2(vs2, __riscv_vle32_v_f32m2(wptr + wstep*std::min(2, unroll_tail-1), vl), v, vl);
vs3 = __riscv_vfmacc_vv_f32m2(vs3, __riscv_vle32_v_f32m2(wptr + wstep*std::min(3, unroll_tail-1), vl), v, vl);
vs4 = __riscv_vfmacc_vv_f32m2(vs4, __riscv_vle32_v_f32m2(wptr + wstep*std::min(4, unroll_tail-1), vl), v, vl);
vs5 = __riscv_vfmacc_vv_f32m2(vs5, __riscv_vle32_v_f32m2(wptr + wstep*std::min(5, unroll_tail-1), vl), v, vl);
vs6 = __riscv_vfmacc_vv_f32m2(vs6, __riscv_vle32_v_f32m2(wptr + wstep*std::min(6, unroll_tail-1), vl), v, vl);
vs7 = __riscv_vfmacc_vv_f32m2(vs7, __riscv_vle32_v_f32m2(wptr + wstep*std::min(7, unroll_tail-1), vl), v, vl);
vs8 = __riscv_vfmacc_vv_f32m2(vs8, __riscv_vle32_v_f32m2(wptr + wstep*std::min(8, unroll_tail-1), vl), v, vl);
vs9 = __riscv_vfmacc_vv_f32m2(vs9, __riscv_vle32_v_f32m2(wptr + wstep*std::min(9, unroll_tail-1), vl), v, vl);
vs10 = __riscv_vfmacc_vv_f32m2(vs10, __riscv_vle32_v_f32m2(wptr + wstep*std::min(10, unroll_tail-1), vl), v, vl);
vs11 = __riscv_vfmacc_vv_f32m2(vs11, __riscv_vle32_v_f32m2(wptr + wstep*std::min(11, unroll_tail-1), vl), v, vl);
vs12 = __riscv_vfmacc_vv_f32m2(vs12, __riscv_vle32_v_f32m2(wptr + wstep*std::min(12, unroll_tail-1), vl), v, vl);
vs13 = __riscv_vfmacc_vv_f32m2(vs13, __riscv_vle32_v_f32m2(wptr + wstep*std::min(13, unroll_tail-1), vl), v, vl);
}
// Calculate the sum of each vector
float sum[14];
vfloat32m1_t zero = vfmv_v_f_f32m1(0, vlm2);
sum[0] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs0, zero, vlm2));
sum[1] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs1, zero, vlm2));
sum[2] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs2, zero, vlm2));
sum[3] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs3, zero, vlm2));
sum[4] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs4, zero, vlm2));
sum[5] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs5, zero, vlm2));
sum[6] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs6, zero, vlm2));
sum[7] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs7, zero, vlm2));
sum[8] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs8, zero, vlm2));
sum[9] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs9, zero, vlm2));
sum[10] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs10, zero, vlm2));
sum[11] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs11, zero, vlm2));
sum[12] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs12, zero, vlm2));
sum[13] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m2_f32m1(zero, vs13, zero, vlm2));
vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0, vlm2);
sum[0] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs0, zero, vlm2));
sum[1] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs1, zero, vlm2));
sum[2] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs2, zero, vlm2));
sum[3] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs3, zero, vlm2));
sum[4] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs4, zero, vlm2));
sum[5] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs5, zero, vlm2));
sum[6] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs6, zero, vlm2));
sum[7] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs7, zero, vlm2));
sum[8] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs8, zero, vlm2));
sum[9] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs9, zero, vlm2));
sum[10] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs10, zero, vlm2));
sum[11] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs11, zero, vlm2));
sum[12] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs12, zero, vlm2));
sum[13] = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(vs13, zero, vlm2));
vfloat32m4_t s0 = vfadd_vv_f32m4(vle32_v_f32m4(sum, unroll_tail), vle32_v_f32m4(bias + i, unroll_tail), unroll_tail);
vse32_v_f32m4(dst + i, s0, unroll_tail);
vfloat32m4_t s0 = __riscv_vfadd_vv_f32m4(__riscv_vle32_v_f32m4(sum, unroll_tail), __riscv_vle32_v_f32m4(bias + i, unroll_tail), unroll_tail);
__riscv_vse32_v_f32m4(dst + i, s0, unroll_tail);
}
}