diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index e4d13af1a2..bda1d8558f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -346,11 +346,37 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh
 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
 OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
 
+/* Load a 4-byte value and zero-extend it into the second doubleword; the first doubleword is don't-care. */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* This fallback is compiler-agnostic, but introduces an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    // Zero-extend the four bytes with a single permute instead of unpacking; usually faster in small kernels.
+    // The load itself is zero-extended, so byte 12 of 'out' is known to be zero and serves as the permute's zero filler.
+    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
+    vec_uchar16 out;
+
+    _LXSIWZX(out, ptr, vec_uchar16);
+    out = vec_perm(out, out, pmu);
+    return v_uint32x4((vec_uint4)out);
+}
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
-{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
+{
+    vec_char16 out;
+    vec_short8 outs;
+    vec_int4 outw;
+
+    _LXSIWZX(out, ptr, vec_char16);
+    outs = vec_unpackl(out);
+    outw = vec_unpackh(outs);
+    return v_int32x4(outw);
+}
 
 /* pack */
 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 0705108503..236955bd66 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -7,6 +7,31 @@ namespace opencv_test {
 
 typedef tuple<MatType, Size, Size> MatInfo_Size_Size_t;
 typedef TestBaseWithParam<MatInfo_Size_Size_t> MatInfo_Size_Size;
+typedef tuple<Size, Size> Size_Size_t;
+typedef tuple<MatType, Size_Size_t> MatInfo_SizePair_t;
+typedef TestBaseWithParam<MatInfo_SizePair_t> MatInfo_SizePair;
+
+#define MATTYPE_NE_VALUES CV_8UC1,  CV_8UC2,  CV_8UC3,  CV_8UC4, \
+                          CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, \
+                          CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4
+
+// For gradient-ish testing of the other matrix formats
+template <typename T>
+static void fillFPGradient(Mat& img)
+{
+    const int ch = img.channels();
+
+    int r, c, i;
+    for(r=0; r<img.rows; r++)
+    {
+        for(c=0; c<img.cols; c++)
+        {
+            T *pix = (T*)img.ptr(r, c);
+            for(i=0; i<ch; i++)
+                pix[i] = (T)(r + c + i);
+        }
+    }
+}
 
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
             testing::Values(
@@ -33,6 +58,33 @@ PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeUpLinearNonExact,
+            testing::Combine
+            (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values( Size_Size_t(szVGA, szqHD), Size_Size_t(szVGA, sz720p) )
+            )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
             testing::Values(
                 MatInfo_Size_Size_t(CV_8UC1, szVGA, szQVGA),
@@ -80,6 +132,40 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
 #endif
 }
 
+PERF_TEST_P(MatInfo_SizePair, resizeDownLinearNonExact,
+            testing::Combine
+            (
+                testing::Values( MATTYPE_NE_VALUES ),
+                testing::Values
+                (
+                    Size_Size_t(szVGA, szQVGA),
+                    Size_Size_t(szqHD, szVGA),
+                    Size_Size_t(sz720p, Size(120 * sz720p.width / sz720p.height, 120)),
+                    Size_Size_t(sz720p, szVGA),
+                    Size_Size_t(sz720p, szQVGA)
+                )
+            )
+            )
+{
+    int matType = get<0>(GetParam());
+    Size_Size_t sizes = get<1>(GetParam());
+    Size from = get<0>(sizes);
+    Size to = get<1>(sizes);
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resize(src, dst, to, 0, 0, INTER_LINEAR);
+
+    SANITY_CHECK_NOTHING();
+}
+
 typedef tuple<MatType, Size, double> MatInfo_Size_Scale_t;
 typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index b0283e5ca4..56f06ac4b7 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -1481,10 +1481,320 @@ typedef VResizeNoVec VResizeLanczos4Vec_32f;
 
 #endif
 
+#if CV_SIMD128
+
+// 4-lane horizontal linear pass for float/ushort/short sources: gather four source
+// pairs per iteration and apply the two-tap filter with deinterleaved coefficients.
+template <typename ST, typename DT, typename AT, typename DVT>
+struct HResizeLinearVec_X4
+{
+    int operator()(const uchar** _src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const ST **src = (const ST**)_src;
+        const AT *alpha = (const AT*)_alpha;
+        DT **dst = (DT**)_dst;
+        const int nlanes = 4;
+        const int len0 = xmax & -nlanes;
+        int dx = 0, k = 0;
+
+        for( ; k <= (count - 2); k+=2 )
+        {
+            const ST *S0 = src[k];
+            DT *D0 = dst[k];
+            const ST *S1 = src[k+1];
+            DT *D1 = dst[k+1];
+
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S0[sx0], S0[sx1], S0[sx2], S0[sx3]);
+                DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
+                DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
+                DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
+                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
+                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        for( ; k < count; k++ )
+        {
+            const ST *S = src[k];
+            DT *D = dst[k];
+            for( dx = 0; dx < len0; dx += nlanes )
+            {
+                int sx0 = xofs[dx+0];
+                int sx1 = xofs[dx+1];
+                int sx2 = xofs[dx+2];
+                int sx3 = xofs[dx+3];
+                DVT a_even;
+                DVT a_odd;
+
+                v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
+                DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
+                DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
+                v_store(&D[dx], s0 * a_even + s1 * a_odd);
+            }
+        }
+        return dx;
+    }
+};
+
+// 8-bit specialization: 16-bit dot products compute both filter taps at once,
+// with a separate gather strategy per channel count.
+struct HResizeLinearVecU8_X4
+{
+    int operator()(const uchar** src, uchar** _dst, int count, const int* xofs,
+        const uchar* _alpha, int, int, int cn, int, int xmax) const
+    {
+        const short *alpha = (const short*)_alpha;
+        int **dst = (int**)_dst;
+        int dx = 0, k = 0;
+
+        if(cn == 1)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S0, xofs+dx), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_lut_pairs(S1, xofs+dx), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
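+                    // 'al' holds interleaved (left, right) coefficient pairs for outputs
+                    // dx..dx+3; 'ah' below covers dx+4..dx+7. Each v_dotprod then yields
+                    // S[xofs[dx+i]]*alpha[2*(dx+i)] + S[xofs[dx+i]+1]*alpha[2*(dx+i)+1] per 32-bit lane.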
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_lut_pairs(S, xofs+dx), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 2)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S0, xofs+dx)), sl, sh);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                    v_expand(v_interleave_pairs(v_lut_quads(S1, xofs+dx)), sl, sh);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 al = v_load(alpha+dx*2);
+                    v_int16x8 ah = v_load(alpha+dx*2+8);
+                    v_uint16x8 sl, sh;
+                    v_expand(v_interleave_pairs(v_lut_quads(S, xofs+dx)), sl, sh);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(sl), al));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(sh), ah));
+                }
+            }
+        }
+        else if(cn == 3)
+        {
+            /* Each iteration yields one 3-channel pixel (3 valid lanes); the 4-lane store
+               overlaps the next pixel, so len0 stops a full step short of xmax. */
+            const int step = 4;
+            const int len0 = xmax - step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += 3*step/4 )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
+                }
+            }
+        }
+        else if(cn == 4)
+        {
+            const int step = 4;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S0+xofs[dx]))), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S1+xofs[dx]))), a));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a = v_load(alpha+dx*2);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_interleave_quads(v_load_expand(S+xofs[dx]))), a));
+                }
+            }
+        }
+        else if(cn < 9)
+        {
+            const int step = 8;
+            const int len0 = xmax & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                /* Only the first cn of the 8 computed lanes are valid, so advance by cn
+                   and let the next iteration overwrite the remainder. */
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S0+xofs[dx]), v_load_expand(S0+xofs[dx]+cn), s0, s1);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                    v_zip(v_load_expand(S1+xofs[dx]), v_load_expand(S1+xofs[dx]+cn), s0, s1);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += cn )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_uint16x8 s0, s1;
+                    v_zip(v_load_expand(S+xofs[dx]), v_load_expand(S+xofs[dx]+cn), s0, s1);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(s0), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(s1), a1));
+                }
+            }
+        }
+        else
+        {
+            const int step = 16;
+            const int len0 = (xmax - cn) & -step;
+            for( ; k <= (count - 2); k+=2 )
+            {
+                const uchar *S0 = src[k];
+                int *D0 = dst[k];
+                const uchar *S1 = src[k+1];
+                int *D1 = dst[k+1];
+
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S0, xofs+dx), v_lut(S0+cn, xofs+dx), s01, s23);
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D0[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D0[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D0[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                    v_zip(v_lut(S1, xofs+dx), v_lut(S1+cn, xofs+dx), s01, s23);
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D1[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D1[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D1[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+            for( ; k < count; k++ )
+            {
+                const uchar *S = src[k];
+                int *D = dst[k];
+                for( dx = 0; dx < len0; dx += step )
+                {
+                    v_int16x8 a0 = v_load(alpha+dx*2);
+                    v_int16x8 a1 = v_load(alpha+dx*2 + 8);
+                    v_int16x8 a2 = v_load(alpha+dx*2 + 16);
+                    v_int16x8 a3 = v_load(alpha+dx*2 + 24);
+                    v_uint8x16 s01, s23;
+                    v_zip(v_lut(S, xofs+dx), v_lut(S+cn, xofs+dx), s01, s23);
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_expand_low(s01)), a0));
+                    v_store(&D[dx+4], v_dotprod(v_reinterpret_as_s16(v_expand_high(s01)), a1));
+                    v_store(&D[dx+8], v_dotprod(v_reinterpret_as_s16(v_expand_low(s23)), a2));
+                    v_store(&D[dx+12], v_dotprod(v_reinterpret_as_s16(v_expand_high(s23)), a3));
+                }
+            }
+        }
+        return dx;
+    }
+};
+
+typedef HResizeLinearVec_X4<float, float, float, v_float32x4> HResizeLinearVec_32f;
+typedef HResizeLinearVec_X4<ushort, float, float, v_float32x4> HResizeLinearVec_16u32f;
+typedef HResizeLinearVec_X4<short, float, float, v_float32x4> HResizeLinearVec_16s32f;
+typedef HResizeLinearVecU8_X4 HResizeLinearVec_8u32s;
+
+#else
+
 typedef HResizeNoVec HResizeLinearVec_8u32s;
 typedef HResizeNoVec HResizeLinearVec_16u32f;
 typedef HResizeNoVec HResizeLinearVec_16s32f;
 typedef HResizeNoVec HResizeLinearVec_32f;
+
+#endif
+
 typedef HResizeNoVec HResizeLinearVec_64f;
 
@@ -1505,7 +1815,7 @@ struct HResizeLinear
         int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
             xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
 
-        for( k = 0; k <= count - 2; k++ )
+        for( k = 0; k <= count - 2; k+=2 )
         {
             const T *S0 = src[k], *S1 = src[k+1];
             WT *D0 = dst[k], *D1 = dst[k+1];
@@ -1529,7 +1839,7 @@
         {
             const T *S = src[k];
             WT *D = dst[k];
-            for( dx = 0; dx < xmax; dx++ )
+            for( dx = dx0; dx < xmax; dx++ )
             {
                 int sx = xofs[dx];
                 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
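
Note on the last two hunks: the two-row loop body consumes src[k] and src[k+1], so stepping
k by one recomputed every interior row twice; stepping by two removes that duplicated work.
Likewise, the scalar tail loop previously restarted at dx = 0 and redid columns the
vectorized vecOp had already produced, so it now resumes at dx0, the value vecOp returns.
A minimal scalar model of that handoff contract (hypothetical names, single row,
illustration only, not the real OpenCV code):

    // Stand-in for vecOp (e.g. HResizeLinearVec_8u32s): returns 'dx0', the first
    // output column it did NOT produce; the scalar tail resumes there.
    static int hresize_vec_stub(const float*, float*, const int*, const float*, int)
    {
        return 0; // pretend no columns were vectorized
    }

    static void hresize_row(const float* S, float* D, const int* xofs,
                            const float* alpha, int xmax, int cn)
    {
        int dx0 = hresize_vec_stub(S, D, xofs, alpha, xmax);
        for (int dx = dx0; dx < xmax; dx++) // resume where the vector pass stopped
        {
            int sx = xofs[dx];
            // two-tap linear filter: blend each source pixel with its right neighbour
            D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
        }
    }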