Merge pull request #25743 from hanliutong:rvv-fp16

Add FP16 support for RISC-V
Merged by Alexander Smorkalov on 2024-08-23 15:29:21 +03:00 (committed via GitHub), commit 41097a48ad
10 changed files with 585 additions and 117 deletions


@ -52,7 +52,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SK
list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
list(APPEND CPU_ALL_OPTIMIZATIONS RVV FP16 RVV_ZVFH)
list(APPEND CPU_ALL_OPTIMIZATIONS LSX)
list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@ -170,6 +170,21 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
set(CPU_BASELINE_DETECT ON)
endif()
# This macro traverses all the dependent (IMPLIES) features of each option and merges their extension suffixes into CPU_${OPT}_FLAGS_ON (e.g. RVV_ZVFH, which implies RVV and FP16, ends up with "-march=rv64gc_v_zvfhmin_zvfh").
macro(ocv_cpu_riscv_update_flag FEATURE_NAME_LIST COMMON_OPTION)
foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
unset(APPEND_TRAILING)
# Traverse all dependencies and merge their extensions into a single flag.
foreach(IMPLIE IN LISTS CPU_${OPT}_IMPLIES)
string(APPEND APPEND_TRAILING "_${CPU_${IMPLIE}_FLAG}")
endforeach()
string(APPEND APPEND_TRAILING "_${CPU_${OPT}_FLAG}")
# Update flag
set(CPU_${OPT}_FLAGS_ON "${COMMON_OPTION}${APPEND_TRAILING}")
message(STATUS "CPU_${OPT}_FLAGS_ON is ${CPU_${OPT}_FLAGS_ON}")
endforeach()
endmacro()
if(X86 OR X86_64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
@ -390,12 +405,28 @@ elseif(PPC64LE)
set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(RISCV)
if(NOT DEFINED PLATFORM_STR)
set(PLATFORM_STR "rv64gc")
endif()
ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV;FP16;RVV_ZVFH")
ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV")
ocv_update(CPU_RVV_FLAGS_ON "-march=rv64gcv")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
ocv_update(CPU_RVV_ZVFH_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv_fp16.cpp")
ocv_update(CPU_RVV_ZVFH_IMPLIES "RVV;FP16")
ocv_update(CPU_FP16_IMPLIES "RVV")
set(CPU_RVV_FLAG "v")
set(CPU_FP16_FLAG "zvfhmin")
set(CPU_RVV_ZVFH_FLAG "zvfh")
set(BASE_ARCHITECTURE "-march=${PLATFORM_STR}")
ocv_cpu_riscv_update_flag(CPU_KNOWN_OPTIMIZATIONS ${BASE_ARCHITECTURE})
ocv_update(CPU_RVV_FLAGS_CONFLICT "-march=[^ ]*")
set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}")
if(NOT ${BUILD_SHARED_LIBS}) # static build for k230
add_extra_compiler_option("-static -static-libgcc -static-libstdc++")
endif()
set(CPU_DISPATCH "FP16;RVV_ZVFH" CACHE STRING "${HELP_CPU_DISPATCH}")
set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
elseif(LOONGARCH64)
@ -495,6 +526,32 @@ macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRI
endif()
endmacro()
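# Editorial note (not part of the patch): the macro below merges the separate "-march=<base>_<ext...>"
# strings that the per-feature options contributed to ${FLAG_STRING} into one "-march=" option, so the
# RISC-V compiler is not handed several conflicting -march flags
# (e.g. "-march=rv64gc_v -march=rv64gc_v_zvfhmin" collapses to "-march=rv64gc_v_zvfhmin").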
macro(ocv_cpu_riscv_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION)
unset(_POSTFIX)
unset(APPEND_TRAILING)
# Check each feature option.
foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND)
if(NOT ${OPT_FOUND} EQUAL -1)
# e.g. when ${CPU_${OPT}_FLAGS_ON} is "rv64gc_v_zvfhmin"
# the ${TRAILING_PART} will be "_v_zvfhmin"
# and the ${parts} will be "_v;_zvfhmin" (a list)
string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}")
string(REGEX MATCHALL "_[^_]+" parts ${TRAILING_PART})
list(APPEND _POSTFIX ${parts})
# remove ${CPU_${OPT}_FLAGS_ON} from ${FLAG_STRING}
string(REGEX REPLACE "${CPU_${OPT}_FLAGS_ON}( |$)" "" ${FLAG_STRING} ${${FLAG_STRING}})
endif()
endforeach()
# Remove the duplicate extensions. (e.g. _v, _v, ...)
list(REMOVE_DUPLICATES _POSTFIX)
# Merge into a single extensions flag
foreach(TRAILING IN LISTS _POSTFIX)
string(APPEND APPEND_TRAILING "${TRAILING}")
endforeach()
set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${APPEND_TRAILING}")
endmacro()
foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "")
if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled")
@ -597,6 +654,11 @@ if(AARCH64)
endif()
endif()
if(RISCV)
string(STRIP "${CPU_BASELINE_FLAGS}" CPU_BASELINE_FLAGS)
ocv_cpu_riscv_baseline_merge_feature_options(CPU_KNOWN_OPTIMIZATIONS CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE})
endif()
foreach(OPT ${CPU_BASELINE_REQUIRE})
if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})")


@ -23,6 +23,27 @@ int test()
*(float16x4_t*)dst = v_dst;
return (int)dst[0];
}
#elif (defined __riscv_zvfhmin && __riscv_zvfhmin) || (defined __riscv_zvfh && __riscv_zvfh)
#include <riscv_vector.h>
int test()
{
const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f};
const float input2[] = {-0.5f, -1.5f, -2.5f, -3.5f};
short dst[4];
size_t vl = __riscv_vsetvl_e16m1(4);
vfloat16m1_t in_f16 = __riscv_vle16_v_f16m1(input1, vl);
vfloat32m2_t in_f32 = __riscv_vle32_v_f32m2(input2, vl);
vfloat32m2_t cvt_f32 = __riscv_vfwcvt_f_f_v_f32m2(in_f16, vl);
vfloat32m2_t res_f32 = __riscv_vfadd(in_f32, cvt_f32, vl);
vfloat16m1_t res_f16 = __riscv_vfncvt_f_f_w_f16m1(res_f32, vl);
__riscv_vse16_v_f16m1((_Float16*)dst, res_f16, vl);
return (int)dst[0];
}
#else
#error "FP16 is not supported"
#endif


@ -0,0 +1,25 @@
#include <stdio.h>
#if defined(__riscv) && __riscv && defined (__riscv_zvfh) && __riscv_zvfh
# include <riscv_vector.h>
int test()
{
const _Float16 input1[] = {0.5f, 1.5f, 2.5f, 3.5f};
const _Float16 input2[] = {-0.5f, -1.5f, -2.5f, -3.5f};
size_t vl = __riscv_vsetvl_e16m1(4);
vfloat16m1_t vec1 = __riscv_vle16_v_f16m1(input1, vl);
vfloat16m1_t vec2 = __riscv_vle16_v_f16m1(input2, vl);
vfloat16m1_t result = __riscv_vfadd_vv_f16m1(vec1, vec2, vl);
return (int)__riscv_vfmv_f_s_f16m1_f16(result);
}
#else
#error "RISC-V Vector Extension with Half-Precision Floating-Point (zvfh) is not supported"
#endif
int main()
{
printf("%d\n", test());
return 0;
}


@ -74,6 +74,8 @@
#ifdef CV_CPU_COMPILE_FP16
# if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
# include <arm_neon.h>
# elif defined(__riscv_vector)
# include <riscv_vector.h>
# else
# include <immintrin.h>
# endif
@ -250,6 +252,11 @@ struct VZeroUpperGuard {
# define CV_FP16 1
#endif
#if defined(__riscv_zvfhmin) && __riscv_zvfhmin || (defined(__riscv_zvfh) && __riscv_zvfh)
# include <riscv_vector.h>
# define CV_FP16 1
#endif
#endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code)


@ -567,6 +567,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV_ZVFH
# define CV_TRY_RVV_ZVFH 1
# define CV_CPU_FORCE_RVV_ZVFH 1
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 1
# define CV_CPU_CALL_RVV_ZVFH(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args) return (opt_RVV_ZVFH::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV_ZVFH
# define CV_TRY_RVV_ZVFH 1
# define CV_CPU_FORCE_RVV_ZVFH 0
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH (cv::checkHardwareSupport(CV_CPU_RVV_ZVFH))
# define CV_CPU_CALL_RVV_ZVFH(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV_ZVFH) return (opt_RVV_ZVFH::fn args)
#else
# define CV_TRY_RVV_ZVFH 0
# define CV_CPU_FORCE_RVV_ZVFH 0
# define CV_CPU_HAS_SUPPORT_RVV_ZVFH 0
# define CV_CPU_CALL_RVV_ZVFH(fn, args)
# define CV_CPU_CALL_RVV_ZVFH_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV_ZVFH(fn, args, mode, ...) CV_CPU_CALL_RVV_ZVFH(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
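
For orientation, a rough sketch of how these generated macros are normally consumed by OpenCV's dispatch machinery; the function name and file layout below are illustrative only, not part of this patch:

// foo.dispatch.cpp (illustrative)
#include "foo.simd.hpp"
#include "foo.simd_declarations.hpp"  // generated; provides CV_CPU_DISPATCH_MODES_ALL

void foo(const float* src, float* dst, int n)
{
    // Expands into a chain of CV_CPU_CALL_* macros; with this patch the chain can
    // include CV_CPU_CALL_RVV_ZVFH, which forwards to opt_RVV_ZVFH::foo when
    // cv::checkHardwareSupport(CV_CPU_RVV_ZVFH) reports support at run time.
    CV_CPU_DISPATCH(foo, (src, dst, n), CV_CPU_DISPATCH_MODES_ALL);
}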
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX
# define CV_TRY_LSX 1
# define CV_CPU_FORCE_LSX 1


@ -288,6 +288,7 @@ namespace cv {
#define CV_CPU_VSX3 201
#define CV_CPU_RVV 210
#define CV_CPU_RVV_ZVFH 211
#define CV_CPU_LSX 230
#define CV_CPU_LASX 231
@ -350,6 +351,7 @@ enum CpuFeatures {
CPU_VSX3 = 201,
CPU_RVV = 210,
CPU_RVV_ZVFH = 211,
CPU_LSX = 230,
CPU_LASX = 231,
@ -384,6 +386,8 @@ enum CpuFeatures {
#if defined __ARM_FP16_FORMAT_IEEE \
&& !defined __CUDACC__
# define CV_FP16_TYPE 1
#elif (defined(__riscv_zvfh) && __riscv_zvfh) || (defined(__riscv_zvfhmin) && __riscv_zvfhmin)
# define CV_FP16_TYPE 1
#else
# define CV_FP16_TYPE 0
#endif
@ -838,12 +842,14 @@ class hfloat
public:
#if CV_FP16_TYPE
hfloat() = default;
explicit hfloat(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
#if defined __ARM_FP16_FORMAT_IEEE
explicit hfloat(float x) { h = (__fp16)x; }
protected:
__fp16 h;
#else
explicit hfloat(float x) { h = (_Float16)x; }
explicit operator _Float16() const { return h; }
protected:
_Float16 h;
#endif
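
A minimal usage sketch of cv::hfloat with the new _Float16 storage path (assumes a toolchain where CV_FP16_TYPE is 1):

#include <opencv2/core.hpp>

int main()
{
    cv::hfloat h(1.5f);      // stored as _Float16 on RVV zvfh/zvfhmin targets, __fp16 on Arm
    float back = (float)h;   // widened back to float via the conversion operator
    return back == 1.5f ? 0 : 1;
}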


@ -343,6 +343,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD_SCALABLE_64F 0
#endif
#ifndef CV_SIMD_SCALABLE_FP16
#define CV_SIMD_SCALABLE_FP16 0
#endif
//==================================================================================================
template<typename _Tp> struct V_RegTraits
@ -412,6 +416,9 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
#if CV_SIMD_SCALABLE_FP16
CV_DEF_REG_TRAITS(v, v_float16, hfloat, f16, v_float16, v_float32, v_float64, v_int16, v_int16);
#endif
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
@ -542,6 +549,7 @@ using namespace CV__SIMD_NAMESPACE;
#define CV__SIMD_NAMESPACE simd
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 0
#define CV_SIMD_FP16 0
#define CV_SIMD_WIDTH 128 /* 1024/8 */
#define VXPREFIX(func) v##func
@ -565,7 +573,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); }
#endif
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
@ -585,7 +593,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_setzero_f16() { return VXPREFIX(_setzero_f16)(); }
#endif
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
@ -605,7 +613,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); }
#endif
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
@ -625,7 +633,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
@ -645,7 +653,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
@ -665,7 +673,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
@ -685,7 +693,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
@ -705,7 +713,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
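
As a rough illustration (not part of the patch), the widened guards above let a generic FP16 kernel compile against both the fixed-width FP16 backends and the new scalable RVV backend:

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
void scale_f16(const cv::hfloat* src, cv::hfloat* dst, int n, cv::hfloat s)
{
    cv::v_float16 vs = cv::vx_setall_f16(s);
    const int step = cv::VTraits<cv::v_float16>::vlanes();
    int i = 0;
    for (; i + step <= n; i += step)
        cv::v_store(dst + i, cv::v_mul(cv::vx_load(src + i), vs));
    for (; i < n; i++)                       // scalar tail
        dst[i] = cv::hfloat((float)src[i] * (float)s);
}
#endif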


@ -30,6 +30,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1
#if defined(__riscv_zvfh) && __riscv_zvfh
#define CV_SIMD_SCALABLE_FP16 1
#else
#define CV_SIMD_SCALABLE_FP16 0
#endif
using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
@ -40,6 +46,9 @@ using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;
#if CV_SIMD_SCALABLE_FP16
using v_float16 = vfloat16m1_t;
#endif
using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
@ -117,6 +126,13 @@ OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m1_t, hfloat, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m2_t, hfloat, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m4_t, hfloat, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat16m8_t, hfloat, e16m8, 16)
#endif
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
@ -155,6 +171,12 @@ OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
#if CV_SIMD_SCALABLE_FP16
inline hfloat v_get0(const v_float16& v) \
{ \
return (hfloat)__riscv_vfmv_f(v); \
}
#endif
inline float v_get0(const v_float32& v) \
{ \
return __riscv_vfmv_f(v); \
@ -197,6 +219,20 @@ inline v_##_Tpv v_setall_##suffix(_Tp v) \
return __riscv_vfmv_v_f_##suffix##m1(v, vl); \
}
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_setzero_f16()
{
return __riscv_vfmv_v_f_f16m1(0, VTraits<v_float16>::vlanes());
}
inline v_float16 v_setall_f16(float v) // In some cases we may use v_setall_f16(1.0f)
{
return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits<v_float16>::vlanes());
}
inline v_float16 v_setall_f16(hfloat v)
{
return __riscv_vfmv_v_f_f16m1((_Float16)v, VTraits<v_float16>::vlanes());
}
#endif
OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
@ -216,6 +252,9 @@ OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
@ -234,6 +273,10 @@ inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, float16, u16, f16, u16, f16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, float16, s16, f16, i16, f16)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
@ -277,6 +320,14 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float16, u8, f16, u, f, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float16, u32, f16, u, f, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float16, u64, f16, u, f, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float16, s8, f16, i, f, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float16, s32, f16, i, f, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float16, s64, f16, i, f, 64, 16)
#endif
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
@ -291,6 +342,17 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_reinterpret_as_f16(const v_float64& v) \
{ \
return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\
}
inline v_float64 v_reinterpret_as_f64(const v_float16& v) \
{ \
return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_f16m1_u16m1(v)));\
}
#endif
inline v_float32 v_reinterpret_as_f32(const v_float64& v) \
{ \
return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(v)));\
@ -332,9 +394,12 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
return __riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \
return (_Tp)__riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float16, hfloat, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits<v_float64>::vlanes())
@ -343,7 +408,7 @@ OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits<v_float64>::vlanes())
#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(_Tpvec v) \
{ \
return v_extract_n(v, vl-1); \
return (_Tp)v_extract_n(v, vl-1); \
}
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
@ -354,6 +419,9 @@ OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float16, hfloat, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
@ -408,6 +476,47 @@ _Tpvec v_load_##suffix(Targs... nScalars) \
return v_load({nScalars...}); \
}
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
__riscv_vse##width##_v_##suffix##m1((_Float16*)ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
return __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m1((_Float16*)ptr0, hvl), __riscv_vle##width##_v_##suffix##m1((_Float16*)ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
__riscv_vse##width((_Float16*)ptr, __riscv_vslidedown_vx_##suffix##m1(a, hvl, vl), hvl); \
}
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8)
@ -417,6 +526,9 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP_FP16(v_float16, vfloat16m1_t, hfloat, VTraits<v_float16>::vlanes() /2 , VTraits<v_float16>::vlanes(), 16, f16)
#endif
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32)
#if CV_SIMD_SCALABLE_64F
@ -430,16 +542,25 @@ inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
#define OPENCV_HAL_IMPL_RVV_LUT_FP16(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_Float16*)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_FP16(v_float16, hfloat, m2)
#endif
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
#endif
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, _TpCast, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
auto v0 = __riscv_vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
@ -449,19 +570,22 @@ inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
auto sh1 = __riscv_vslide1up(v_trunc(__riscv_vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \
auto vid = __riscv_vor(sh1, v_trunc(__riscv_vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, short, m1, m2, OPENCV_HAL_NOP)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float16, hfloat, _Float16, m1, m2, OPENCV_HAL_NOP)
#endif
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, int64_t, mf2, m1, __riscv_vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, double, mf2, m1, __riscv_vlmul_trunc_u32mf2)
#endif
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, _TpCast, suffix0, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
auto v0 = __riscv_vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
@ -481,12 +605,15 @@ inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
auto shwid1 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \
auto vid = __riscv_vor(shwid1, __riscv_vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
return __riscv_vloxei32((_TpCast *)tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, short, mf2 , m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, int, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float16, hfloat, _Float16, mf2 , m1, m2, OPENCV_HAL_NOP)
#endif
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, float, mf2, m1, m1, __riscv_vlmul_trunc_u32mf2)
#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
@ -557,6 +684,12 @@ OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, __riscv_vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, __riscv_vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, __riscv_vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, __riscv_vssub)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, add, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, sub, __riscv_vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, mul, __riscv_vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float16, div, __riscv_vfdiv)
#endif
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, __riscv_vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, __riscv_vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, __riscv_vmul)
@ -602,6 +735,10 @@ OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, __riscv_vadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, __riscv_vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, __riscv_vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, __riscv_vfmul)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float16, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float16, __riscv_vfmul)
#endif
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, __riscv_vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, __riscv_vfmul)
@ -689,14 +826,30 @@ OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \
#if CV_SIMD_SCALABLE_FP16
#define OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(intrin) \
inline v_float16 intrin (const v_float16& a, const v_float16& b) \
{ \
return __riscv_vreinterpret_f16m1(intrin(__riscv_vreinterpret_i16m1(a), __riscv_vreinterpret_i16m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT16_BIT_OP(v_xor)
inline v_float16 v_not (const v_float16& a) \
{ \
return __riscv_vreinterpret_f16m1(v_not(__riscv_vreinterpret_i16m1(a))); \
}
#endif
#define OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(intrin) \
inline v_float32 intrin (const v_float32& a, const v_float32& b) \
{ \
return __riscv_vreinterpret_f32m1(intrin(__riscv_vreinterpret_i32m1(a), __riscv_vreinterpret_i32m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT32_BIT_OP(v_xor)
inline v_float32 v_not (const v_float32& a) \
{ \
@ -774,6 +927,18 @@ inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
return _Tpvec(res); \
} //TODO
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
{ \
size_t VLEN = VTraits<_Tpvec>::vlanes(); \
union { uint64_t u; _Float16 d; } ones; \
ones.u = -1; \
auto diff = intrin(a, b, VLEN); \
auto z = __riscv_vfmv_v_f_##suffix##m1(0, VLEN); \
auto res = __riscv_vfmerge(z, ones.d, diff, VLEN); \
return _Tpvec(res); \
} //TODO
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, __riscv_vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, __riscv_vmsne, suffix) \
@ -798,6 +963,13 @@ OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, __riscv_vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, __riscv_vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, __riscv_vmfge, suffix)
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, eq, __riscv_vmfeq, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ne, __riscv_vmfne, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, lt, __riscv_vmflt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, gt, __riscv_vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, le, __riscv_vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP_FP16(_Tpvec, ge, __riscv_vmfge, suffix)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
@ -807,11 +979,19 @@ OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_FP16(v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64)
#endif
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_not_nan(const v_float16& a)
{ return v_eq(a, a); }
#endif
inline v_float32 v_not_nan(const v_float32& a)
{ return v_eq(a, a); }
@ -840,6 +1020,10 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, __riscv_vminu, VTraits<v_uint32>::
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, __riscv_vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, __riscv_vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, __riscv_vmax, VTraits<v_int32>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_min, __riscv_vfmin, VTraits<v_float16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float16, v_max, __riscv_vfmax, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, __riscv_vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, __riscv_vfmax, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
@ -990,6 +1174,10 @@ OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(),
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, max, hfloat, f16, VTraits<v_float16>::vlanes(), fredmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float16, min, hfloat, f16, VTraits<v_float16>::vlanes(), fredmin)
#endif
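
Sketch of the new FP16 reductions in use (illustrative; n is assumed to be a multiple of the vector length):

#if CV_SIMD_SCALABLE_FP16
inline cv::hfloat max_f16(const cv::hfloat* ptr, int n)
{
    const int step = cv::VTraits<cv::v_float16>::vlanes();
    cv::v_float16 vmax = cv::vx_load(ptr);
    for (int i = step; i + step <= n; i += step)
        vmax = cv::v_max(vmax, cv::vx_load(ptr + i));
    return cv::v_reduce_max(vmax);            // horizontal reduction added above
}
#endif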
inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
const v_float32& c, const v_float32& d)
@ -1043,53 +1231,31 @@ inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
}
////////////// Square-Root //////////////
inline v_float32 v_sqrt(const v_float32& x)
{
return __riscv_vfsqrt(x, VTraits<v_float32>::vlanes());
#define OPENCV_HAL_IMPL_RVV_SQR_FP(_Tpvec, _setAllFunc) \
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ \
return __riscv_vfsqrt(x, VTraits<_Tpvec>::vlanes()); \
} \
inline _Tpvec v_invsqrt(const _Tpvec& x) \
{ \
return v_div(_setAllFunc(1.0f), v_sqrt(x)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpvec x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \
return v_sqrt(x); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<_Tpvec>::vlanes()), b, b, VTraits<_Tpvec>::vlanes()); \
}
inline v_float32 v_invsqrt(const v_float32& x)
{
v_float32 one = v_setall_f32(1.0f);
return v_div(one, v_sqrt(x));
}
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_sqrt(const v_float64& x)
{
return __riscv_vfsqrt(x, VTraits<v_float64>::vlanes());
}
inline v_float64 v_invsqrt(const v_float64& x)
{
v_float64 one = v_setall_f64(1.0f);
return v_div(one, v_sqrt(x));
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float16, v_setall_f16)
#endif
inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
{
v_float32 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes());
return v_sqrt(x);
}
inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
{
return v_float32(__riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
}
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float32, v_setall_f32)
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
{
v_float64 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
return v_sqrt(x);
}
inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
{
return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
}
OPENCV_HAL_IMPL_RVV_SQR_FP(v_float64, v_setall_f64)
#endif
////////////// Multiply-Add //////////////
@ -1113,6 +1279,18 @@ inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
return v_fma(a, b, c);
}
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_fma(const v_float16& a, const v_float16& b, const v_float16& c)
{
return __riscv_vfmacc(c, a, b, VTraits<v_float16>::vlanes());
}
inline v_float16 v_muladd(const v_float16& a, const v_float16& b, const v_float16& c)
{
return v_fma(a, b, c);
}
#endif
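
Illustrative use of the FP16 fused multiply-add (a sketch, not from the patch):

#if CV_SIMD_SCALABLE_FP16
// acc += a*b for one vector's worth of lanes
inline cv::v_float16 mac_f16(const cv::v_float16& acc, const cv::v_float16& a, const cv::v_float16& b)
{
    return cv::v_fma(a, b, acc);   // maps to __riscv_vfmacc on the RVV backend
}
#endif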
#if CV_SIMD_SCALABLE_64F
inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
{
@ -1153,6 +1331,13 @@ inline bool v_check_all(const v_uint16& a)
inline bool v_check_any(const v_uint16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
#if CV_SIMD_SCALABLE_FP16
inline bool v_check_all(const v_float16& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_float16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
#endif
inline bool v_check_all(const v_uint32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint32& a)
@ -1186,6 +1371,9 @@ inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float16, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
@ -1212,6 +1400,9 @@ inline _Tprvec v_abs(const _Tpvec& a) \
OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ABS(v_float16, v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
@ -1246,6 +1437,12 @@ OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_select(const v_float16& mask, const v_float16& a, const v_float16& b) \
{ \
return __riscv_vmerge(b, a, __riscv_vmfne(mask, 0, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes()); \
}
#endif
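
The FP16 comparisons and v_select combine in the usual branch-free way; a small sketch (not from the patch):

#if CV_SIMD_SCALABLE_FP16
// Per-lane ReLU: keep positive lanes, zero the rest
inline cv::v_float16 relu_f16(const cv::v_float16& x)
{
    cv::v_float16 zero = cv::v_setzero_f16();
    return cv::v_select(cv::v_gt(x, zero), x, zero);
}
#endif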
inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \
{ \
@ -1314,12 +1511,39 @@ template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float16, f16, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits<v_float64>::vlanes())
#endif
////////////// Convert to float //////////////
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_cvt_f16(const v_float32 &a)
{
return __riscv_vfncvt_f(__riscv_vlmul_ext_f32m2(a), VTraits<v_float32>::vlanes());
}
inline v_float16 v_cvt_f16(const v_float32 &a, const v_float32 &b)
{
return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f32m2(a),1,b), VTraits<v_float16>::vlanes());
}
inline v_float16 v_cvt_f16(const v_int16 &a)
{
return __riscv_vfcvt_f(a, VTraits<v_float16>::vlanes());
}
inline v_float32 v_cvt_f32(const v_float16 &a)
{
return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits<v_float16>::vlanes()), 0);
}
inline v_float32 v_cvt_f32_high(const v_float16 &a)
{
return __riscv_vget_f32m1(__riscv_vfwcvt_f(a, VTraits<v_float16>::vlanes()), 1);
}
#endif
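
Sketch of the new FP32-to-FP16 conversions in use (illustrative):

#if CV_SIMD_SCALABLE_FP16
// Narrow two float32 vectors into one float16 vector and store it
inline void pack_f32_to_f16(const float* src, cv::hfloat* dst)
{
    const int fstep = cv::VTraits<cv::v_float32>::vlanes();
    cv::v_float32 lo = cv::vx_load(src);
    cv::v_float32 hi = cv::vx_load(src + fstep);
    cv::v_store(dst, cv::v_cvt_f16(lo, hi));
}
#endif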
inline v_float32 v_cvt_f32(const v_int32& a)
{
return __riscv_vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
@ -1367,13 +1591,16 @@ inline v_float64 v_cvt_f64(const v_int64& a)
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
{ \
return v_setall_##suffix(v_extract_n(v, i)); \
return v_setall_##suffix((_Float16)v_extract_n(v, i)); \
} \
inline _Tpvec v_broadcast_highest(_Tpvec v) \
{ \
return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
return v_setall_##suffix((_Float16)v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
}
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float16, f16)
#endif
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
@ -1390,6 +1617,9 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_REVERSE(v_float16, 16)
#endif
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
@ -1531,6 +1761,9 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_H
OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, __riscv_vreinterpret_u8m2, __riscv_vreinterpret_u8m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_ZIP(v_float16, vfloat16m2_t, f16, 16, 32, __riscv_vreinterpret_u16m2, __riscv_vreinterpret_u16m1)
#endif
OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, __riscv_vreinterpret_u32m2, __riscv_vreinterpret_u32m1)
@ -1580,66 +1813,72 @@ OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float16, 16)
#endif
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
#endif
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix, width, hwidth, vl) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
}\
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
\
a = __riscv_vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
d = __riscv_vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
a = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
b = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+1), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
c = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+2), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
d = __riscv_vlse##width##_v_##suffix##m1((_TpCast *)(ptr+3), sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
__riscv_vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+1), sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+2), sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
__riscv_vsse##width((_TpCast *)(ptr+3), sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, short, i16, 16, 8, VTraits<v_int16>::vlanes())
#if CV_SIMD_SCALABLE_FP16
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float16, hfloat, _Float16, f16, 16, 8, VTraits<v_float16>::vlanes())
#endif
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, int, i32, 32, 16, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, float, f32, 32, 16, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, double, f64, 64, 32, VTraits<v_float64>::vlanes())
#endif
static uint64_t idx_interleave_pairs[] = { \
@ -1781,6 +2020,10 @@ inline int64 v_signmask(const v_uint8& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }
inline int64 v_signmask(const v_uint16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
#if CV_SIMD_SCALABLE_FP16
inline int v_signmask(const v_float16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
#endif
inline int v_signmask(const v_uint32& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32& a)
@ -1862,6 +2105,35 @@ inline void v_pack_store(hfloat* ptr, const v_float32& v)
}
#endif
////////////// Rounding //////////////
#if CV_SIMD_SCALABLE_FP16
inline v_int16 v_round(const v_float16& a)
{
return __riscv_vfcvt_x(a, VTraits<v_float16>::vlanes());
}
inline v_int16 v_floor(const v_float16& a)
{
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
return __riscv_vfcvt_x_f_v_i16m1_rm(a, 1 /*RNE, round-to-nearest-even*/, VTraits<v_float16>::vlanes());
#else
return __riscv_vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes());
#endif
}
inline v_int16 v_ceil(const v_float16& a)
{
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
return __riscv_vfcvt_x_f_v_i16m1_rm(a, 3 /*ROD, round-to-odd*/, VTraits<v_float16>::vlanes());
#else
return __riscv_vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float16>::vlanes()), VTraits<v_float16>::vlanes());
#endif
}
inline v_int16 v_trunc(const v_float16& a)
{
return __riscv_vfcvt_rtz_x(a, VTraits<v_float16>::vlanes());
}
#endif
inline v_int32 v_round(const v_float32& a)
{
// return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
@ -2155,6 +2427,41 @@ inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const
#endif
// TODO: only 128 bit now.
#if CV_SIMD_SCALABLE_FP16
inline v_float16 v_matmul( const v_float16 &v,
const v_float16 &m0, const v_float16 &m1,
const v_float16 &m2, const v_float16 &m3,
const v_float16 &m4, const v_float16 &m5,
const v_float16 &m6, const v_float16 &m7) {
vfloat16m1_t res;
res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 7), m7, VTraits<v_float16>::vlanes());
return res;
}
inline v_float16 v_matmuladd( const v_float16 &v,
const v_float16 &m0, const v_float16 &m1,
const v_float16 &m2, const v_float16 &m3,
const v_float16 &m4, const v_float16 &m5,
const v_float16 &m6,
const v_float16 &a) {
vfloat16m1_t res;
res = __riscv_vfmul_vf_f16m1(m0, (_Float16)v_extract_n(v, 0), VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 1), m1, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 2), m2, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 3), m3, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 4), m4, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 5), m5, VTraits<v_float16>::vlanes());
res = __riscv_vfmacc_vf_f16m1(res, (_Float16)v_extract_n(v, 6), m6, VTraits<v_float16>::vlanes());
return __riscv_vfadd(res, a, VTraits<v_float16>::vlanes());
}
#endif
inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
const v_float32& m1, const v_float32& m2,
const v_float32& m3)


@ -434,6 +434,7 @@ struct HWFeatures
g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";
g_hwFeatureNames[CPU_RVV] = "RVV";
g_hwFeatureNames[CPU_RVV_ZVFH] = "RVV_ZVFH";
g_hwFeatureNames[CPU_LSX] = "LSX";
g_hwFeatureNames[CPU_LASX] = "LASX";
@ -712,6 +713,12 @@ struct HWFeatures
#if defined __riscv && defined __riscv_vector
have[CV_CPU_RVV] = true;
#if (defined __riscv_zvfh && __riscv_zvfh) || (defined __riscv_zvfhmin && __riscv_zvfhmin)
have[CV_CPU_FP16] = true;
#endif
#if defined __riscv_zvfh && __riscv_zvfh
have[CV_CPU_RVV_ZVFH] = true;
#endif
#endif
#if defined __loongarch64 && defined __linux__
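
Applications can then query the new capability bit at run time in the usual way (sketch):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    std::cout << "RVV:      " << cv::checkHardwareSupport(CV_CPU_RVV)      << std::endl;
    std::cout << "RVV_ZVFH: " << cv::checkHardwareSupport(CV_CPU_RVV_ZVFH) << std::endl;
    return 0;
}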


@ -159,8 +159,8 @@ template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R
out << "{ ";
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
// out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
out << +d.d[i];
out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
// out << +d.d[i]; // Note: No operator '<<' for _Float16
if (i + 1 < VTraits<R>::vlanes())
out << ", ";
}
@ -182,7 +182,7 @@ template<> inline void EXPECT_COMPARE_EQ_<double>(const double a, const double b
EXPECT_DOUBLE_EQ( a, b );
}
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
template<> inline void EXPECT_COMPARE_EQ_<hfloat>(const hfloat a, const hfloat b)
{
EXPECT_LT(std::abs(float(a - b)), 0.126);
@ -564,7 +564,7 @@ template<typename R> struct TheTest
// Handle accuracy for fp16
TheTest & test_div_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
@ -1572,7 +1572,7 @@ template<typename R> struct TheTest
TheTest & test_matmul_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
Data<R> dataV, data0, data1, data2, data3, data4, data5, data6, data7;
data1.reverse();
data2 += 2;
@ -1657,7 +1657,8 @@ template<typename R> struct TheTest
TheTest & test_transpose8x8_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/)
// Note: The scalable backend does not yet implement fixed-length functions
Data<R> dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7;
dataA1 *= 2;
dataA2 *= 4;
@ -1713,7 +1714,8 @@ template<typename R> struct TheTest
TheTest & test_reduce_sum8()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 /*|| CV_SIMD_SCALABLE_FP16*/)
// Note: The scalable backend does not yet implement fixed-length functions
Data<R> dataA, dataB, dataC, dataD, dataW, dataX, dataY, dataZ;
dataB *= 0.01f;
dataC *= 0.001f;
@ -1773,7 +1775,7 @@ template<typename R> struct TheTest
TheTest & test_loadstore_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
AlignedData<R> data;
AlignedData<R> out;
@ -1804,7 +1806,7 @@ template<typename R> struct TheTest
TheTest & test_float_cvt_fp16()
{
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
AlignedData<v_float32> data;
// check conversion
@ -2449,7 +2451,7 @@ void test_hal_intrin_float16()
DUMP_ENTRY(v_float16);
#if CV_FP16
TheTest<v_float32>().test_loadstore_fp16_f32();
#if CV_SIMD_FP16
#if (CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16)
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
@ -2476,6 +2478,8 @@ void test_hal_intrin_float16()
.test_extract_n<0>().test_extract_n<1>()
.test_exp_fp16()
.test_log_fp16()
#else
std::cout << "SKIP: CV_SIMD_FP16 || CV_SIMD_SCALABLE_FP16 is not available" << std::endl;
#endif
;
#else