diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index ca1b3aa9b5..6a0cf1cd08 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -240,6 +240,7 @@ public:
     bool isUMatVector() const;
     bool isMatx() const;
     bool isVector() const;
+    bool isGpuMat() const;
     bool isGpuMatVector() const;
     ~_InputArray();
 
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 9b2da34e61..617c61ddad 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -157,6 +157,7 @@ inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
 inline bool _InputArray::isVector() const { return kind() == _InputArray::STD_VECTOR ||
                                                    kind() == _InputArray::STD_BOOL_VECTOR ||
                                                    kind() == _InputArray::STD_ARRAY; }
+inline bool _InputArray::isGpuMat() const { return kind() == _InputArray::CUDA_GPU_MAT; }
 inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT; }
 
 ////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
index 07473d459e..4c14340722 100644
--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@@ -145,6 +145,18 @@ private:
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
     std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+    std::vector<Point> gpu_tl_points_;
+    std::vector<cuda::GpuMat> gpu_imgs_with_border_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_weight_pyr_gauss_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_src_pyr_laplace_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_ups_;
+    cuda::GpuMat gpu_dst_mask_;
+    cuda::GpuMat gpu_mask_;
+    cuda::GpuMat gpu_img_;
+    cuda::GpuMat gpu_weight_map_;
+    cuda::GpuMat gpu_add_mask_;
+    int gpu_feed_idx_;
+    bool gpu_initialized_;
 #endif
 };
 
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index 5a7fa856c0..5e9f4133ea 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -221,6 +221,7 @@ MultiBandBlender::MultiBandBlender(int try_gpu, int num_bands, int weight_type)
 
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     can_use_gpu_ = try_gpu && cuda::getCudaEnabledDeviceCount();
+    gpu_feed_idx_ = 0;
 #else
     (void) try_gpu;
     can_use_gpu_ = false;
@@ -248,6 +249,15 @@ void MultiBandBlender::prepare(Rect dst_roi)
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
     {
+        gpu_initialized_ = false;
+        gpu_feed_idx_ = 0;
+
+        gpu_tl_points_.clear();
+        gpu_weight_pyr_gauss_vec_.clear();
+        gpu_src_pyr_laplace_vec_.clear();
+        gpu_ups_.clear();
+        gpu_imgs_with_border_.clear();
+
         gpu_dst_pyr_laplace_.resize(num_bands_ + 1);
         gpu_dst_pyr_laplace_[0].create(dst_roi.size(), CV_16SC3);
         gpu_dst_pyr_laplace_[0].setTo(Scalar::all(0));
@@ -320,7 +330,37 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     int64 t = getTickCount();
 #endif
 
-    UMat img = _img.getUMat();
+    UMat img;
+
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    // If using gpu save the top left coordinate when running first time after prepare
+    if (can_use_gpu_)
+    {
+        if (!gpu_initialized_)
+        {
+            gpu_tl_points_.push_back(tl);
+        }
+        else
+        {
+            tl = gpu_tl_points_[gpu_feed_idx_];
+        }
+    }
+    // If _img is not a GpuMat get it as UMat from the InputArray object.
+    // If it is GpuMat make a dummy object with right dimensions but no data and
+    // get _img as a GpuMat
+    if (!_img.isGpuMat())
+#endif
+    {
+        img = _img.getUMat();
+    }
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    else
+    {
+        gpu_img_ = _img.getGpuMat();
+        img = UMat(gpu_img_.rows, gpu_img_.cols, gpu_img_.type());
+    }
+#endif
+
     CV_Assert(img.type() == CV_16SC3 || img.type() == CV_8UC3);
     CV_Assert(mask.type() == CV_8U);
 
@@ -357,42 +397,63 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
     {
+        if (!gpu_initialized_)
+        {
+            gpu_imgs_with_border_.push_back(cuda::GpuMat());
+            gpu_weight_pyr_gauss_vec_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+            gpu_src_pyr_laplace_vec_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+            gpu_ups_.push_back(std::vector<cuda::GpuMat>(num_bands_));
+        }
+
+        // If _img is not GpuMat upload it to gpu else gpu_img_ was set already
+        if (!_img.isGpuMat())
+        {
+            gpu_img_.upload(img);
+        }
+
         // Create the source image Laplacian pyramid
-        cuda::GpuMat gpu_img;
-        gpu_img.upload(img);
-        cuda::GpuMat img_with_border;
-        cuda::copyMakeBorder(gpu_img, img_with_border, top, bottom, left, right, BORDER_REFLECT);
-        std::vector<cuda::GpuMat> gpu_src_pyr_laplace(num_bands_ + 1);
-        img_with_border.convertTo(gpu_src_pyr_laplace[0], CV_16S);
+        cuda::copyMakeBorder(gpu_img_, gpu_imgs_with_border_[gpu_feed_idx_], top, bottom,
+                             left, right, BORDER_REFLECT);
+        gpu_imgs_with_border_[gpu_feed_idx_].convertTo(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][0], CV_16S);
         for (int i = 0; i < num_bands_; ++i)
-            cuda::pyrDown(gpu_src_pyr_laplace[i], gpu_src_pyr_laplace[i + 1]);
+            cuda::pyrDown(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i],
+                          gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i + 1]);
         for (int i = 0; i < num_bands_; ++i)
         {
-            cuda::GpuMat up;
-            cuda::pyrUp(gpu_src_pyr_laplace[i + 1], up);
-            cuda::subtract(gpu_src_pyr_laplace[i], up, gpu_src_pyr_laplace[i]);
+            cuda::pyrUp(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i + 1], gpu_ups_[gpu_feed_idx_][i]);
+            cuda::subtract(gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i],
+                           gpu_ups_[gpu_feed_idx_][i],
+                           gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i]);
         }
 
-        // Create the weight map Gaussian pyramid
-        cuda::GpuMat gpu_mask;
-        gpu_mask.upload(mask);
-        cuda::GpuMat weight_map;
-        std::vector<cuda::GpuMat> gpu_weight_pyr_gauss(num_bands_ + 1);
+        // Create the weight map Gaussian pyramid only if not yet initialized
+        if (!gpu_initialized_)
+        {
+            if (mask.isGpuMat())
+            {
+                gpu_mask_ = mask.getGpuMat();
+            }
+            else
+            {
+                gpu_mask_.upload(mask);
+            }
 
-        if (weight_type_ == CV_32F)
-        {
-            gpu_mask.convertTo(weight_map, CV_32F, 1. / 255.);
+            if (weight_type_ == CV_32F)
+            {
+                gpu_mask_.convertTo(gpu_weight_map_, CV_32F, 1. / 255.);
+            }
+            else // weight_type_ == CV_16S
+            {
+                gpu_mask_.convertTo(gpu_weight_map_, CV_16S);
+                cuda::compare(gpu_mask_, 0, gpu_add_mask_, CMP_NE);
+                cuda::add(gpu_weight_map_, Scalar::all(1), gpu_weight_map_, gpu_add_mask_);
+            }
+            cuda::copyMakeBorder(gpu_weight_map_, gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][0], top,
+                                 bottom, left, right, BORDER_CONSTANT);
+            for (int i = 0; i < num_bands_; ++i)
+                cuda::pyrDown(gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i],
+                              gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i + 1]);
         }
-        else // weight_type_ == CV_16S
-        {
-            gpu_mask.convertTo(weight_map, CV_16S);
-            cuda::GpuMat add_mask;
-            cuda::compare(gpu_mask, 0, add_mask, CMP_NE);
-            cuda::add(weight_map, Scalar::all(1), weight_map, add_mask);
-        }
-        cuda::copyMakeBorder(weight_map, gpu_weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT);
-        for (int i = 0; i < num_bands_; ++i)
-            cuda::pyrDown(gpu_weight_pyr_gauss[i], gpu_weight_pyr_gauss[i + 1]);
 
         int y_tl = tl_new.y - dst_roi_.y;
         int y_br = br_new.y - dst_roi_.y;
@@ -403,9 +464,9 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
         for (int i = 0; i <= num_bands_; ++i)
         {
             Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl);
-            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace[i];
+            cuda::GpuMat &_src_pyr_laplace = gpu_src_pyr_laplace_vec_[gpu_feed_idx_][i];
             cuda::GpuMat _dst_pyr_laplace = gpu_dst_pyr_laplace_[i](rc);
-            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss[i];
+            cuda::GpuMat &_weight_pyr_gauss = gpu_weight_pyr_gauss_vec_[gpu_feed_idx_][i];
             cuda::GpuMat _dst_band_weights = gpu_dst_band_weights_[i](rc);
 
             using namespace cv::cuda::device::blend;
@@ -420,6 +481,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
             x_tl /= 2; y_tl /= 2;
             x_br /= 2; y_br /= 2;
         }
+        ++gpu_feed_idx_;
         return;
     }
 #endif
@@ -445,7 +507,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
     UMat weight_map;
     std::vector<UMat> weight_pyr_gauss(num_bands_ + 1);
 
-    if(weight_type_ == CV_32F)
+    if (weight_type_ == CV_32F)
     {
         mask.getUMat().convertTo(weight_map, CV_32F, 1./255.);
     }
@@ -486,7 +548,7 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
             Mat _dst_pyr_laplace = dst_pyr_laplace_[i](rc).getMat(ACCESS_RW);
             Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ);
             Mat _dst_band_weights = dst_band_weights_[i](rc).getMat(ACCESS_RW);
-            if(weight_type_ == CV_32F)
+            if (weight_type_ == CV_32F)
             {
                 for (int y = 0; y < rc.height; ++y)
                 {
@@ -540,11 +602,15 @@ void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl)
 
 void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 {
-    cv::UMat dst_band_weights_0;
     Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height);
 #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
     if (can_use_gpu_)
     {
+        if (!gpu_initialized_)
+        {
+            gpu_ups_.push_back(std::vector<cuda::GpuMat>(num_bands_+1));
+        }
+
         for (int i = 0; i <= num_bands_; ++i)
         {
             cuda::GpuMat dst_i = gpu_dst_pyr_laplace_[i];
@@ -564,20 +630,50 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
         // Restore image from Laplacian pyramid
         for (size_t i = num_bands_; i > 0; --i)
         {
-            cuda::GpuMat up;
-            cuda::pyrUp(gpu_dst_pyr_laplace_[i], up);
-            cuda::add(up, gpu_dst_pyr_laplace_[i - 1], gpu_dst_pyr_laplace_[i - 1]);
+            cuda::pyrUp(gpu_dst_pyr_laplace_[i], gpu_ups_[gpu_ups_.size()-1][num_bands_-i]);
+            cuda::add(gpu_ups_[gpu_ups_.size()-1][num_bands_-i],
+                      gpu_dst_pyr_laplace_[i - 1],
+                      gpu_dst_pyr_laplace_[i - 1]);
         }
 
-        gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
-        gpu_dst_band_weights_[0].download(dst_band_weights_0);
+        // If dst is GpuMat do masking on gpu and return dst as a GpuMat
+        // else download the image to cpu and return it as an ordinary Mat
+        if (dst.isGpuMat())
+        {
+            cuda::GpuMat &gpu_dst = dst.getGpuMatRef();
 
-        gpu_dst_pyr_laplace_.clear();
-        gpu_dst_band_weights_.clear();
+            cuda::compare(gpu_dst_band_weights_[0](dst_rc), WEIGHT_EPS, gpu_dst_mask_, CMP_GT);
+
+            cuda::compare(gpu_dst_mask_, 0, gpu_mask_, CMP_EQ);
+
+            gpu_dst_pyr_laplace_[0](dst_rc).setTo(Scalar::all(0), gpu_mask_);
+            gpu_dst_pyr_laplace_[0](dst_rc).convertTo(gpu_dst, CV_16S);
+
+        }
+        else
+        {
+            gpu_dst_pyr_laplace_[0](dst_rc).download(dst_);
+            Mat dst_band_weights_0;
+            gpu_dst_band_weights_[0].download(dst_band_weights_0);
+
+            compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
+            Blender::blend(dst, dst_mask);
+        }
+
+        // Set destination Mats to 0 so new image can be blended
+        for (size_t i = 0; i < num_bands_ + 1; ++i)
+        {
+            gpu_dst_band_weights_[i].setTo(0);
+            gpu_dst_pyr_laplace_[i].setTo(Scalar::all(0));
+        }
+        gpu_feed_idx_ = 0;
+        gpu_initialized_ = true;
     }
     else
 #endif
     {
+        cv::UMat dst_band_weights_0;
+
         for (int i = 0; i <= num_bands_; ++i)
             normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]);
 
@@ -588,11 +684,11 @@ void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask)
 
         dst_pyr_laplace_.clear();
         dst_band_weights_.clear();
+
+        compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
+
+        Blender::blend(dst, dst_mask);
     }
-
-    compare(dst_band_weights_0(dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT);
-
-    Blender::blend(dst, dst_mask);
 }
 
 
diff --git a/modules/stitching/test/test_blenders.cuda.cpp b/modules/stitching/test/test_blenders.cuda.cpp
index 5d556febe9..0a3f61e2e4 100644
--- a/modules/stitching/test/test_blenders.cuda.cpp
+++ b/modules/stitching/test/test_blenders.cuda.cpp
@@ -50,12 +50,16 @@ namespace opencv_test { namespace {
         detail::MultiBandBlender blender(try_cuda, 5);
 
         blender.prepare(Rect(0, 0, max(im1.cols, im2.cols), max(im1.rows, im2.rows)));
-        blender.feed(im1, mask1, Point(0,0));
-        blender.feed(im2, mask2, Point(0,0));
 
-        Mat result_s, result_mask;
-        blender.blend(result_s, result_mask);
-        result_s.convertTo(result, CV_8U);
+        // If using cuda try blending multiple times without calling prepare inbetween
+        for (int i = 0; i < (try_cuda ? 10 : 1); ++i) {
+            blender.feed(im1, mask1, Point(0, 0));
+            blender.feed(im2, mask2, Point(0, 0));
+
+            Mat result_s, result_mask;
+            blender.blend(result_s, result_mask);
+            result_s.convertTo(result, CV_8U);
+        }
     }
 
 TEST(CUDA_MultiBandBlender, Accuracy)