diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index ff57d39041..fa5c7f280d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -543,6 +543,40 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m2_t, float, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64)
 #endif
 
+template <int N = VTraits<v_uint16>::max_nlanes>
+inline void v_store(ushort* ptr, const v_uint16& a)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, a);
+    for (int i = 0; i < N; i++) {
+        ptr[i] = buf[i];
+    }
+}
+template <> inline void v_store<8>(ushort* ptr, const v_uint16& a)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, a);
+    ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3];
+    ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7];
+}
+
+template <int N = VTraits<v_float32>::max_nlanes>
+inline void v_store(float* ptr, const v_float32& a)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, a);
+    for (int i = 0; i < N; i++) {
+        ptr[i] = buf[i];
+    }
+}
+template <> inline void v_store<4>(float* ptr, const v_float32& a)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, a);
+    ptr[0] = buf[0]; ptr[1] = buf[1];
+    ptr[2] = buf[2]; ptr[3] = buf[3];
+}
+
 ////////////// Lookup table access ////////////////////
 #define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
@@ -1616,6 +1650,42 @@ OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m4_t, v_int16, 16, i32, i16, __
 OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m4_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x)
 OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m4_t, v_int32, 32, i64, i32, __riscv_vwcvt_x)
 
+template <int N = VTraits<v_float32>::max_nlanes>
+inline v_float32 v_load(const float* ptr)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, v_setzero_f32());
+    for (int i = 0; i < N; i++) {
+        buf[i] = ptr[i];
+    }
+    return v_load(buf);
+}
+template <> inline v_float32 v_load<4>(const float* ptr)
+{
+    float buf[VTraits<v_float32>::max_nlanes];
+    v_store(buf, v_setzero_f32());
+    buf[0] = ptr[0]; buf[1] = ptr[1]; buf[2] = ptr[2]; buf[3] = ptr[3];
+    return v_load(buf);
+}
+
+template <int N = VTraits<v_uint32>::max_nlanes>
+inline v_uint32 v_load_expand(const ushort* ptr)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, v_setzero_u16());
+    for (int i = 0; i < N; i++) {
+        buf[i] = ptr[i];
+    }
+    return v_load_expand(buf);
+}
+template <> inline v_uint32 v_load_expand<4>(const ushort* ptr)
+{
+    ushort buf[VTraits<v_uint16>::max_nlanes];
+    v_store(buf, v_setzero_u16());
+    buf[0] = ptr[0]; buf[1] = ptr[1]; buf[2] = ptr[2]; buf[3] = ptr[3];
+    return v_load_expand(buf);
+}
+
 inline v_uint32 v_load_expand_q(const uchar* ptr)
 {
     return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf2(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes());
@@ -1627,16 +1697,16 @@ inline v_int32 v_load_expand_q(const schar* ptr)
 }
 
 template ::max_nlanes>
-inline v_uint32 v_load_expand_q(const uchar* ptr, int n = N)
+inline v_uint32 v_load_expand_q(const uchar* ptr)
 {
     uchar buf[VTraits::max_nlanes];
     v_store(buf, v_setzero_u8());
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < N; i++) {
         buf[i] = ptr[i];
     }
     return v_load_expand_q(buf);
 }
-template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr, int n)
+template <> inline v_uint32 v_load_expand_q<4>(const uchar* ptr)
 {
     uchar buf[VTraits::max_nlanes];
     v_store(buf, v_setzero_u8());
@@ -1714,19 +1784,48 @@ void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits::vlanes(), VTraits::vlanes())
 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m4_u32m4, VTraits::vlanes(), VTraits::vlanes())
 
+template <int N = VTraits<v_int32>::max_nlanes>
+inline v_uint16 v_pack_u(const v_int32& a, const v_int32& b)
+{
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_u_store(bufa, a);
+    v_pack_u_store(bufb, b);
+    ushort buf[2*N];
+    for (int i = 0; i < N; i++) {
+        buf[i] = bufa[i];
+        buf[i+N] = bufb[i];
+    }
+    return v_load(buf);
+}
+
+template <> inline v_uint16 v_pack_u<4>(const v_int32& a, const v_int32& b)
+{
+    constexpr int N = VTraits<v_uint16>::max_nlanes;
+    ushort bufa[N];
+    ushort bufb[N];
+    v_pack_u_store(bufa, a);
+    v_pack_u_store(bufb, b);
+
+    ushort buf[N];
+    buf[0] = bufa[0]; buf[1] = bufa[1]; buf[2] = bufa[2]; buf[3] = bufa[3];
+    buf[4] = bufb[0]; buf[5] = bufb[1]; buf[6] = bufb[2]; buf[7] = bufb[3];
+    return v_load(buf);
+}
+
 template ::max_nlanes>
-inline void v_pack_u_store(uchar* ptr, const v_int16& a, int n = N)
+inline void v_pack_store(uchar* ptr, const v_uint16& a)
 {
     uchar buf[VTraits::max_nlanes];
-    v_pack_u_store(buf, a);
-    for (int i = 0; i < n; i++) {
+    v_pack_store(buf, a);
+    for (int i = 0; i < N; i++) {
         ptr[i] = buf[i];
     }
 }
-template <> inline void v_pack_u_store<8>(uchar* ptr, const v_int16& a, int n)
+template <> inline void v_pack_store<8>(uchar* ptr, const v_uint16& a)
 {
     uchar buf[VTraits::max_nlanes];
-    v_pack_u_store(buf, a);
+    v_pack_store(buf, a);
     ptr[0] = buf[0]; ptr[1] = buf[1]; ptr[2] = buf[2]; ptr[3] = buf[3];
     ptr[4] = buf[4]; ptr[5] = buf[5]; ptr[6] = buf[6]; ptr[7] = buf[7];
 }
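Note on the block above: the new v_load/v_store/v_load_expand/v_pack_u/v_pack_store overloads follow the fixed-lane-count pattern already used by v_load_expand_q in this header (a scalar fallback loop in the generic template plus a specialization for the exact count the warp kernels need), so the C4 kernels can do partial loads and stores on the scalable RVV backend. A rough sketch of how they compose for a single 4-channel uint16 pixel; it assumes CV_SIMD_SCALABLE with the helpers added above, and the function and pointer names are illustrative only:

    // Sketch, not part of the patch: round-trip one 4-channel u16 pixel.
    static inline void roundtrip_u16c4(const ushort* src4, ushort* dst8)
    {
        v_uint32  q   = v_load_expand<4>(src4);              // 4 x u16 -> 4 x u32
        v_float32 f   = v_cvt_f32(v_reinterpret_as_s32(q));  // widen to float for interpolation
        v_int32   r   = v_round(f);                          // round back to s32
        v_uint16  out = v_pack_u<4>(r, r);                   // saturating pack of two 4-lane halves
        v_store<8>(dst8, out);                               // partial store of 8 u16 lanes
    }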
diff --git a/modules/imgproc/src/warp_common.vector.hpp b/modules/imgproc/src/warp_common.vector.hpp
index 6405ee6aab..1e14ae20d9 100644
--- a/modules/imgproc/src/warp_common.vector.hpp
+++ b/modules/imgproc/src/warp_common.vector.hpp
@@ -569,16 +569,50 @@
     i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
     i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
     i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
-#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
+    const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_pix0 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs))); \
+    v_float32 i##ofs##_pix1 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+4))); \
+    v_float32 i##ofs##_pix2 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep))); \
+    v_float32 i##ofs##_pix3 = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
+    i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
+    i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
+    const float *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_pix0 = vx_load(srcptr##ofs); \
+    v_float32 i##ofs##_pix1 = vx_load(srcptr##ofs+4); \
+    v_float32 i##ofs##_pix2 = vx_load(srcptr##ofs+srcstep); \
+    v_float32 i##ofs##_pix3 = vx_load(srcptr##ofs+srcstep+4); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_pix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix1, i##ofs##_pix0), i##ofs##_pix0); \
+    i##ofs##_pix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_pix3, i##ofs##_pix2), i##ofs##_pix2); \
+    i##ofs##_pix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_pix2, i##ofs##_pix0), i##ofs##_pix0);
+#define CV_WARP_SIMD128_STORE_8UC4_I() \
+    v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
+    v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
+    v_pack_store(dstptr + 4*(x+i), i01_pix); \
+    v_pack_store(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMD128_STORE_16UC4_I() \
+    v_uint16 i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)); \
+    v_uint16 i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
+    vx_store(dstptr + 4*(x+i), i01_pix); \
+    vx_store(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMD128_STORE_32FC4_I() \
+    vx_store(dstptr + 4*(x+i), i0_pix0); \
+    vx_store(dstptr + 4*(x+i)+4, i1_pix0); \
+    vx_store(dstptr + 4*(x+i)+8, i2_pix0); \
+    vx_store(dstptr + 4*(x+i)+12, i3_pix0);
+#define CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=vlanes_32) { \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(0); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(1); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(2); \
-        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4_I(3); \
-        auto i01_pix = v_pack_u(v_round(i0_pix0), v_round(i1_pix0)), \
-             i23_pix = v_pack_u(v_round(i2_pix0), v_round(i3_pix0)); \
-        v_pack_store(dstptr + 4*(x+i), i01_pix); \
-        v_pack_store(dstptr + 4*(x+i+2), i23_pix); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
+        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
+        CV_WARP_SIMD128_STORE_##DEPTH##C4_I(); \
     }
 #define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(ofs0, ofs1) \
     const uint8_t *srcptr##ofs0 = src + addr[i+ofs0]; \
@@ -602,16 +636,70 @@
     i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
     i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
     i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
-#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_16UC4_I(ofs0, ofs1) \
+    const uint16_t *srcptr##ofs0 = src + addr[i+ofs0]; \
+    const uint16_t *srcptr##ofs1 = src + addr[i+ofs1]; \
+    v_int32 i##ofs0##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0)), \
+            i##ofs0##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs0+srcstep)); \
+    v_int32 i##ofs1##_pix01 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1)), \
+            i##ofs1##_pix23 = v_reinterpret_as_s32(v256_load_expand(srcptr##ofs1+srcstep)); \
+    v_float32 i##ofs0##_fpix01 = v_cvt_f32(i##ofs0##_pix01), i##ofs0##_fpix23 = v_cvt_f32(i##ofs0##_pix23); \
+    v_float32 i##ofs1##_fpix01 = v_cvt_f32(i##ofs1##_pix01), i##ofs1##_fpix23 = v_cvt_f32(i##ofs1##_pix23); \
+    v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
+              i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
+    v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
+    v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
+    v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
+              i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
+              i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
+              i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
+    v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
+              i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
+    i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_32FC4_I(ofs0, ofs1) \
+    const float *srcptr##ofs0 = src + addr[i+ofs0]; \
+    const float *srcptr##ofs1 = src + addr[i+ofs1]; \
+    v_float32 i##ofs0##_fpix01 = v256_load(srcptr##ofs0), \
+              i##ofs0##_fpix23 = v256_load(srcptr##ofs0+srcstep); \
+    v_float32 i##ofs1##_fpix01 = v256_load(srcptr##ofs1), \
+              i##ofs1##_fpix23 = v256_load(srcptr##ofs1+srcstep); \
+    v_float32 i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11, \
+              i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33; \
+    v_recombine(i##ofs0##_fpix01, i##ofs1##_fpix01, i##ofs0##ofs1##_fpix00, i##ofs0##ofs1##_fpix11); \
+    v_recombine(i##ofs0##_fpix23, i##ofs1##_fpix23, i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix33); \
+    v_float32 i##ofs0##_alpha = vx_setall_f32(valpha[i+ofs0]), \
+              i##ofs1##_alpha = vx_setall_f32(valpha[i+ofs1]), \
+              i##ofs0##_beta = vx_setall_f32(vbeta[i+ofs0]), \
+              i##ofs1##_beta = vx_setall_f32(vbeta[i+ofs1]); \
+    v_float32 i##ofs0##ofs1##_alpha = v_combine_low(i##ofs0##_alpha, i##ofs1##_alpha), \
+              i##ofs0##ofs1##_beta = v_combine_low(i##ofs0##_beta, i##ofs1##_beta); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix11, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00); \
+    i##ofs0##ofs1##_fpix22 = v_fma(i##ofs0##ofs1##_alpha, v_sub(i##ofs0##ofs1##_fpix33, i##ofs0##ofs1##_fpix22), i##ofs0##ofs1##_fpix22); \
+    i##ofs0##ofs1##_fpix00 = v_fma(i##ofs0##ofs1##_beta, v_sub(i##ofs0##ofs1##_fpix22, i##ofs0##ofs1##_fpix00), i##ofs0##ofs1##_fpix00);
+#define CV_WARP_SIMD256_STORE_8UC4_I() \
+    auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
+    v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
+    auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
+    v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
+#define CV_WARP_SIMD256_STORE_16UC4_I() \
+    auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
+    vx_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
+    auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
+    vx_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix));
+#define CV_WARP_SIMD256_STORE_32FC4_I() \
+    vx_store(dstptr + 4*(x+i), i01_fpix00); \
+    vx_store(dstptr + 4*(x+i)+8, i23_fpix00); \
+    vx_store(dstptr + 4*(x+i)+16, i45_fpix00); \
+    vx_store(dstptr + 4*(x+i)+24, i67_fpix00);
+#define CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=vlanes_32) { \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(0, 1); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(2, 3); \
-        auto i01_pix = v_round(i01_fpix00), i23_pix = v_round(i23_fpix00); \
-        v_pack_store(dstptr + 4*(x+i), v_pack_u(i01_pix, i23_pix)); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(4, 5); \
-        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4_I(6, 7); \
-        auto i45_pix = v_round(i45_fpix00), i67_pix = v_round(i67_fpix00); \
-        v_pack_store(dstptr + 4*(x+i+4), v_pack_u(i45_pix, i67_pix)); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0, 1); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2, 3); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(4, 5); \
+        CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(6, 7); \
+        CV_WARP_SIMD256_STORE_##DEPTH##C4_I(); \
     }
 #define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(ofs) \
     const uint8_t *srcptr##ofs = src + addr[i+ofs]; \
@@ -624,14 +712,48 @@
     i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
     i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
     i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
-#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4() \
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_16UC4_I(ofs) \
+    const uint16_t *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_fpix0 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs))), \
+              i##ofs##_fpix1 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+4))), \
+              i##ofs##_fpix2 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep))), \
+              i##ofs##_fpix3 = v_cvt_f32(v_reinterpret_as_s32(v_load_expand<4>(srcptr##ofs+srcstep+4))); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
+    i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_32FC4_I(ofs) \
+    const float *srcptr##ofs = src + addr[i+ofs]; \
+    v_float32 i##ofs##_fpix0 = v_load<4>(srcptr##ofs), \
+              i##ofs##_fpix1 = v_load<4>(srcptr##ofs+4), \
+              i##ofs##_fpix2 = v_load<4>(srcptr##ofs+srcstep), \
+              i##ofs##_fpix3 = v_load<4>(srcptr##ofs+srcstep+4); \
+    v_float32 i##ofs##_alpha = vx_setall_f32(valpha[i+ofs]), \
+              i##ofs##_beta = vx_setall_f32(vbeta[i+ofs]); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix1, i##ofs##_fpix0), i##ofs##_fpix0); \
+    i##ofs##_fpix2 = v_fma(i##ofs##_alpha, v_sub(i##ofs##_fpix3, i##ofs##_fpix2), i##ofs##_fpix2); \
+    i##ofs##_fpix0 = v_fma(i##ofs##_beta, v_sub(i##ofs##_fpix2, i##ofs##_fpix0), i##ofs##_fpix0);
+#define CV_WARP_SIMDX_STORE_8UC4_I() \
+    auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
+         i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
+    v_pack_store<8>(dstptr + 4*(x+i), i01_pix); \
+    v_pack_store<8>(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMDX_STORE_16UC4_I() \
+    auto i01_pix = v_pack_u<4>(v_round(i0_fpix0), v_round(i1_fpix0)), \
+         i23_pix = v_pack_u<4>(v_round(i2_fpix0), v_round(i3_fpix0)); \
+    v_store<8>(dstptr + 4*(x+i), i01_pix); \
+    v_store<8>(dstptr + 4*(x+i+2), i23_pix);
+#define CV_WARP_SIMDX_STORE_32FC4_I() \
+    v_store<4>(dstptr + 4*(x+i), i0_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+4, i1_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+8, i2_fpix0); \
+    v_store<4>(dstptr + 4*(x+i)+12, i3_fpix0);
+#define CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(DEPTH) \
     for (int i = 0; i < uf; i+=4) { \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(0); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(1); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(2); \
-        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4_I(3); \
-        auto i01_pix = v_pack(v_round(i0_fpix0), v_round(i1_fpix0)), \
-             i23_pix = v_pack(v_round(i2_fpix0), v_round(i3_fpix0)); \
-        v_pack_u_store<8>(dstptr + 4*(x+i), i01_pix); \
-        v_pack_u_store<8>(dstptr + 4*(x+i+2), i23_pix); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(0); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(1); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(2); \
+        CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_##DEPTH##C4_I(3); \
+        CV_WARP_SIMDX_STORE_##DEPTH##C4_I(); \
     }
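Every per-pixel *_I macro above performs the same three-fma bilinear blend; only the load/convert step changes with the depth, and the generic CV_WARP_SIMD128/256/X_LOAD_SHUFFLE_INTER_C4(DEPTH) wrappers pick the right loader and store by pasting the 8U/16U/32F token into the macro names. A scalar model of the blend they all vectorize (names are illustrative; pix0..pix3 are the 2x2 neighbours, alpha/beta the fractional x/y offsets):

    #include <cmath>

    // Sketch only: what each *_I macro computes per channel.
    static inline float bilinear_blend(float pix0, float pix1, float pix2, float pix3,
                                       float alpha, float beta)
    {
        pix0 = std::fma(alpha, pix1 - pix0, pix0);  // top row:    pix0 + alpha * (pix1 - pix0)
        pix2 = std::fma(alpha, pix3 - pix2, pix2);  // bottom row: pix2 + alpha * (pix3 - pix2)
        return std::fma(beta, pix2 - pix0, pix0);   // blend rows: pix0 + beta  * (pix2 - pix0)
    }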
diff --git a/modules/imgproc/src/warp_kernels.simd.hpp b/modules/imgproc/src/warp_kernels.simd.hpp
index b1ad131079..41f994bc24 100644
--- a/modules/imgproc/src/warp_kernels.simd.hpp
+++ b/modules/imgproc/src/warp_kernels.simd.hpp
@@ -298,7 +298,7 @@ void warpAffineLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -418,7 +418,7 @@ void warpAffineLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -547,7 +547,7 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -590,11 +590,11 @@ void warpAffineLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
                 vx_store(vbeta, src_y0);
                 vx_store(vbeta+vlanes_32, src_y1);
             #if CV_SIMD256
-                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
             #elif CV_SIMD128
-                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
            #elif CV_SIMD_SCALABLE
-                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
            #endif
             } else {
                 uint8_t pixbuf[max_uf*4*4];
@@ -660,7 +660,7 @@ void warpAffineLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -761,7 +761,7 @@ void warpAffineLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -866,7 +866,7 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -876,7 +876,6 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
     uint16_t bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -904,18 +903,26 @@ void warpAffineLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, in
             CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+            #endif
             } else {
+                uint16_t pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -972,7 +979,7 @@ void warpAffineLinearInvoker_32FC1(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1071,7 +1078,7 @@ void warpAffineLinearInvoker_32FC3(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
    std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1176,7 +1183,7 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1186,7 +1193,6 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
     float bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -1218,16 +1224,25 @@ void warpAffineLinearInvoker_32FC4(const float *src_data, size_t src_step, int s
             CV_WARPAFFINE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+            #endif
             } else {
+                float pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
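The hunks above (and the matching ones for warpPerspective and remap below) reshape the C4 inner loop the same way: the scratch pixbuf is declared only on the border path, while the all-inside path spills the fractional coordinates and interpolates straight from src. Condensed from the 32F hunk above into one place, the resulting control flow is roughly the following; this is a summary of the patch, not a separate implementation:

    // Outline condensed from this patch (32FC4 case).
    if (v_reduce_min(inner_mask) != 0) {          // every 2x2 neighbourhood is fully inside src
        float valpha[max_uf], vbeta[max_uf];      // per-pixel fractional x/y offsets
        vx_store(valpha, src_x0); vx_store(valpha + vlanes_32, src_x1);
        vx_store(vbeta,  src_y0); vx_store(vbeta  + vlanes_32, src_y1);
        // depth-dispatched fused load + interpolate + store (256-bit, 128-bit or scalable variant)
        CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
    } else {
        float pixbuf[max_uf*4*4];                 // gather buffer needed only with border handling
        CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
        CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
        CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
        CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
    }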
@@ -1284,7 +1299,7 @@ void warpAffineLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1391,7 +1406,7 @@ void warpAffineLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1505,7 +1520,7 @@ void warpAffineLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1622,7 +1637,7 @@ void warpPerspectiveLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1744,7 +1759,7 @@ void warpPerspectiveLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1874,7 +1889,7 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -1917,11 +1932,11 @@ void warpPerspectiveLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step,
                 vx_store(vbeta, src_y0);
                 vx_store(vbeta+vlanes_32, src_y1);
             #if CV_SIMD256
-                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
            #elif CV_SIMD128
-                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
            #elif CV_SIMD_SCALABLE
-                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
            #endif
             } else {
                 uint8_t pixbuf[max_uf*4*4];
@@ -1988,7 +2003,7 @@ void warpPerspectiveLinearInvoker_16UC1(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2089,7 +2104,7 @@ void warpPerspectiveLinearInvoker_16UC3(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2194,7 +2209,7 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2204,7 +2219,6 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
     uint16_t bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -2232,18 +2246,26 @@ void warpPerspectiveLinearInvoker_16UC4(const uint16_t *src_data, size_t src_ste
             CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+            #endif
             } else {
+                uint16_t pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -2301,7 +2323,7 @@ void warpPerspectiveLinearInvoker_32FC1(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2401,7 +2423,7 @@ void warpPerspectiveLinearInvoker_32FC3(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2507,7 +2529,7 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2517,7 +2539,6 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
     float bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -2549,16 +2570,25 @@ void warpPerspectiveLinearInvoker_32FC4(const float *src_data, size_t src_step,
             CV_WARPPERSPECTIVE_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+            #endif
             } else {
+                float pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -2616,7 +2646,7 @@ void warpPerspectiveLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2724,7 +2754,7 @@ void warpPerspectiveLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2838,7 +2868,7 @@ void warpPerspectiveLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -2961,7 +2991,7 @@ void remapLinearInvoker_8UC1(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3100,7 +3130,7 @@ void remapLinearInvoker_8UC3(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3247,7 +3277,7 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3292,11 +3322,11 @@ void remapLinearInvoker_8UC4(const uint8_t *src_data, size_t src_step, int src_r
                 vx_store(vbeta, src_y0);
                 vx_store(vbeta+vlanes_32, src_y1);
             #if CV_SIMD256
-                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(8U);
            #elif CV_SIMD128
-                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(8U);
            #elif CV_SIMD_SCALABLE
-                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_8UC4();
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(8U);
            #endif
             } else {
                 uint8_t pixbuf[max_uf*4*4];
@@ -3378,7 +3408,7 @@ void remapLinearInvoker_16UC1(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3496,7 +3526,7 @@ void remapLinearInvoker_16UC3(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3618,7 +3648,7 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3628,7 +3658,6 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    uint16_t pixbuf[max_uf*4*4];
     uint16_t bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -3658,18 +3687,26 @@ void remapLinearInvoker_16UC4(const uint16_t *src_data, size_t src_step, int src
             CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 16U);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(16U);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(16U);
+            #endif
             } else {
+                uint16_t pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 16U);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_U16(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CONVERT_U16F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32U16(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -3742,7 +3779,7 @@ void remapLinearInvoker_32FC1(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3859,7 +3896,7 @@ void remapLinearInvoker_32FC3(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3982,7 +4019,7 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -3992,7 +4029,6 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
     int32_t addr[max_uf],
             src_ix[max_uf],
             src_iy[max_uf];
-    float pixbuf[max_uf*4*4];
     float bvalbuf[max_uf*4];
 
     for (int i = 0; i < uf; i++) {
@@ -4026,16 +4062,25 @@ void remapLinearInvoker_32FC4(const float *src_data, size_t src_step, int src_ro
             CV_REMAP_LINEAR_VECTOR_COMPUTE_MAPPED_COORD2(C4);
 
             if (v_reduce_min(inner_mask) != 0) { // all loaded pixels are completely inside the image
-                CV_WARP_LINEAR_VECTOR_SHUFFLE_ALLWITHIN(C4, 32F);
+                float valpha[max_uf], vbeta[max_uf];
+                vx_store(valpha, src_x0);
+                vx_store(valpha+vlanes_32, src_x1);
+                vx_store(vbeta, src_y0);
+                vx_store(vbeta+vlanes_32, src_y1);
+            #if CV_SIMD256
+                CV_WARP_SIMD256_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD128
+                CV_WARP_SIMD128_LOAD_SHUFFLE_INTER_C4(32F);
+            #elif CV_SIMD_SCALABLE
+                CV_WARP_SIMDX_LOAD_SHUFFLE_INTER_C4(32F);
+            #endif
             } else {
+                float pixbuf[max_uf*4*4];
                 CV_WARP_LINEAR_VECTOR_SHUFFLE_NOTALLWITHIN(C4, 32F);
+                CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
+                CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
             }
-
-            CV_WARP_LINEAR_VECTOR_INTER_LOAD_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_CALC_F32(C4);
-
-            CV_WARP_LINEAR_VECTOR_INTER_STORE_F32F32(C4);
         }
 
 #endif // (CV_SIMD || CV_SIMD_SCALABLE)
@@ -4107,7 +4152,7 @@ void remapLinearApproxInvoker_8UC1(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -4229,7 +4274,7 @@ void remapLinearApproxInvoker_8UC3(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
@@ -4359,7 +4404,7 @@ void remapLinearApproxInvoker_8UC4(const uint8_t *src_data, size_t src_step, int
     std::array<float, max_vlanes_32> start_indices;
     std::iota(start_indices.data(), start_indices.data() + max_vlanes_32, 0.f);
 
-    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 2),
+    v_uint32 inner_srows = vx_setall_u32((unsigned)srcrows - 1),
              inner_scols = vx_setall_u32((unsigned)srccols - 1),
              outer_srows = vx_setall_u32((unsigned)srcrows + 1),
              outer_scols = vx_setall_u32((unsigned)srccols + 1);
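On the repeated `srcrows - 2` to `srcrows - 1` change: bilinear sampling reads rows iy and iy+1, so the "fully inside" test wants iy to stay at or below srcrows - 2, mirroring the existing `srccols - 1` bound used for ix. The comparison itself sits outside the quoted hunks, so the predicate below is only a reading of the bound as an unsigned strict less-than; it is a sketch for illustration, not the kernel code:

    // Hedged sketch of the inner-bounds test for one sample (ix, iy).
    static inline bool both_neighbours_inside(int ix, int iy, int srccols, int srcrows)
    {
        // (ix, iy) and (ix+1, iy+1) must all be valid source pixels.
        return (unsigned)ix < (unsigned)(srccols - 1) &&
               (unsigned)iy < (unsigned)(srcrows - 1);   // previously srcrows - 2: one row stricter than needed
    }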