Merge pull request #13694 from dkurt:dnn_ie_async

Asynchronous API from Intel's Inference Engine (#13694) * Add forwardAsync for asynchronous mode from Intel's Inference Engine * Python test for forwardAsync * Replace Future_Mat to AsyncMat * Shadow AsyncMat * Isolate InferRequest callback * Manage exceptions in Async API of IE
2025-07-24 14:06:27 +08:00 · 2019-04-19 21:01:19 +03:00 · 2019-04-19 21:01:19 +03:00 · a5c92c2029
commit a5c92c2029
parent 3abae3c511
8 changed files with 503 additions and 82 deletions
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@ -44,11 +44,14 @@

 #include <vector>
 #include <opencv2/core.hpp>
+#ifdef CV_CXX11
+#include <future>
+#endif

 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
-#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v11 {
+#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v12 {
 #define CV__DNN_EXPERIMENTAL_NS_END }
-namespace cv { namespace dnn { namespace experimental_dnn_34_v11 { } using namespace experimental_dnn_34_v11; }}
+namespace cv { namespace dnn { namespace experimental_dnn_34_v12 { } using namespace experimental_dnn_34_v12; }}
 #else
 #define CV__DNN_EXPERIMENTAL_NS_BEGIN
 #define CV__DNN_EXPERIMENTAL_NS_END
@ -64,6 +67,18 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN

    typedef std::vector<int> MatShape;

+#if defined(CV_CXX11) || defined(CV_DOXYGEN)
+    typedef std::future<Mat> AsyncMat;
+#else
+    // Placeholder type for builds without C++11, so that the AsyncMat name still exists for the bindings.
+    struct AsyncMat
+    {
+        Mat get() { return Mat(); }  // always yields a default-constructed Mat
+        void wait() const {}  // no-op
+        size_t wait_for(size_t milliseconds) const { CV_UNUSED(milliseconds); return -1; }  // timeout is ignored; -1 converts to the largest size_t value
+    };
+#endif
+
    /**
     * @brief Enum of computation backends supported by layers.
     * @see Net::setPreferableBackend
@ -75,7 +90,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        //! DNN_BACKEND_OPENCV otherwise.
        DNN_BACKEND_DEFAULT,
        DNN_BACKEND_HALIDE,
-        DNN_BACKEND_INFERENCE_ENGINE,
+        DNN_BACKEND_INFERENCE_ENGINE,  //!< Intel's Inference Engine computational backend.
        DNN_BACKEND_OPENCV
    };

@ -89,8 +104,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        DNN_TARGET_OPENCL,
        DNN_TARGET_OPENCL_FP16,
        DNN_TARGET_MYRIAD,
-        //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
-        DNN_TARGET_FPGA
+        DNN_TARGET_FPGA  //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
    };

    CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends();
@ -462,6 +476,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         */
        CV_WRAP Mat forward(const String& outputName = String());

+        /** @brief Runs forward pass to compute output of layer with name @p outputName.
+         *  @param outputName name for layer which output is needed to get
+         *  @details By default runs forward pass for the whole network.
+         *
+         *  This is an asynchronous version of forward(const String&).
+         *  dnn::DNN_BACKEND_INFERENCE_ENGINE backend is required.
+         */
+        CV_WRAP AsyncMat forwardAsync(const String& outputName = String());
+
        /** @brief Runs forward pass to compute output of layer with name @p outputName.
         *  @param outputBlobs contains all output blobs for specified layer.
         *  @param outputName name for layer which output is needed to get
--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@ -2,7 +2,13 @@
 typedef dnn::DictValue LayerId;
 typedef std::vector<dnn::MatShape> vector_MatShape;
 typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
-
+#ifdef CV_CXX11
+typedef std::chrono::milliseconds chrono_milliseconds;
+typedef std::future_status AsyncMatStatus;
+#else
+typedef size_t chrono_milliseconds;
+typedef size_t AsyncMatStatus;
+#endif

 template<>
 bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
@ -40,6 +46,46 @@ bool pyopencv_to(PyObject *o, std::vector<Mat> &blobs, const char *name) //requi
  return pyopencvVecConverter<Mat>::to(o, blobs, ArgInfo(name, false));
 }

+#ifdef CV_CXX11
+
+template<>
+PyObject* pyopencv_from(const std::future<Mat>& f_)  // wraps the future into a Ptr<AsyncMat> and converts that to a Python object
+{
+    std::future<Mat>& f = const_cast<std::future<Mat>&>(f_);  // constness is dropped because std::future is move-only and is moved from below
+    Ptr<cv::dnn::AsyncMat> p(new std::future<Mat>(std::move(f)));  // the argument is left in a moved-from state; the new future is owned by the Ptr
+    return pyopencv_from(p);
+}
+
+template<>
+PyObject* pyopencv_from(const std::future_status& status)
+{
+    return pyopencv_from((int)status);
+}
+
+// Reads a size_t from the Python object and stores it in dst as a millisecond
+// duration. Returns false and leaves dst untouched when the conversion fails.
+template<>
+bool pyopencv_to(PyObject* src, std::chrono::milliseconds& dst, const char* name)
+{
+    size_t timeoutMs = 0;
+    const bool converted = pyopencv_to(src, timeoutMs, name);
+    if (!converted)
+        return false;
+
+    dst = std::chrono::milliseconds(timeoutMs);
+    return true;
+}
+
+#else
+
+template<>
+PyObject* pyopencv_from(const cv::dnn::AsyncMat&)
+{
+    CV_Error(Error::StsNotImplemented, "C++11 is required.");
+    return 0;
+}
+
+#endif  // CV_CXX11
+
 template<typename T>
 PyObject* pyopencv_from(const dnn::DictValue &dv)
 {
--- a/modules/dnn/misc/python/shadow_async_mat.hpp
+++ b/modules/dnn/misc/python/shadow_async_mat.hpp
@ -0,0 +1,22 @@
+#error This is a shadow header file, which is not intended for processing by any compiler. \
+       Only bindings parser should handle this file.
+
+namespace cv { namespace dnn {
+
+class CV_EXPORTS_W AsyncMat
+{
+public:
+    //! Waits until the Mat is ready and returns it.
+    CV_WRAP Mat get();
+
+    //! Waits until the Mat is ready without retrieving it.
+    CV_WRAP void wait() const;
+
+    /** @brief Waits for the Mat to become ready, for at most the given amount of time.
+     *  @param timeout Maximum time to wait, in milliseconds.
+     *  @returns Status of the wait, see [std::future_status](https://en.cppreference.com/w/cpp/thread/future_status)
+     */
+    CV_WRAP AsyncMatStatus wait_for(std::chrono::milliseconds timeout) const;
+};
+
+}}
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@ -5,8 +5,8 @@ import numpy as np

 from tests_common import NewOpenCVTests, unittest

-def normAssert(test, a, b, lInf=1e-5):
-    test.assertLess(np.max(np.abs(a - b)), lInf)
+def normAssert(test, a, b, msg=None, lInf=1e-5):  # asserts that the largest absolute element-wise difference of a and b is below lInf
+    test.assertLess(np.max(np.abs(a - b)), lInf, msg)  # msg is forwarded to assertLess as the failure message

 def inter_area(box1, box2):
    x_min, x_max = max(box1[0], box2[0]), min(box1[2], box2[2])
@ -53,53 +53,6 @@ def normAssertDetections(test, ref, out, confThreshold=0.0, scores_diff=1e-5, bo
    if errMsg:
        test.fail(errMsg)

-
-# Returns a simple one-layer network created from Caffe's format
-def getSimpleNet():
-    prototxt = """
-        name: "simpleNet"
-        input: "data"
-        layer {
-          type: "Identity"
-          name: "testLayer"
-          top: "testLayer"
-          bottom: "data"
-        }
-    """
-    return cv.dnn.readNetFromCaffe(bytearray(prototxt, 'utf8'))
-
-
-def testBackendAndTarget(backend, target):
-    net = getSimpleNet()
-    net.setPreferableBackend(backend)
-    net.setPreferableTarget(target)
-    inp = np.random.standard_normal([1, 2, 3, 4]).astype(np.float32)
-    try:
-        net.setInput(inp)
-        net.forward()
-    except BaseException as e:
-        return False
-    return True
-
-
-haveInfEngine = testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU)
-dnnBackendsAndTargets = [
-    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
-]
-
-if haveInfEngine:
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
-    if testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
-
-if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
-    dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
-    if haveInfEngine and cv.ocl_Device.getDefault().isIntel():
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
-        dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
-
-
 def printParams(backend, target):
    backendNames = {
        cv.dnn.DNN_BACKEND_OPENCV: 'OCV',
@ -116,8 +69,44 @@ def printParams(backend, target):

 class dnn_test(NewOpenCVTests):

+    def __init__(self, *args, **kwargs):
+        super(dnn_test, self).__init__(*args, **kwargs)
+        self.dnnBackendsAndTargets = [  # [backend, target] pairs the tests iterate over; OpenCV/CPU is always present
+            [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
+        ]
+
+        if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU):  # Inference Engine pairs are added only when a probe forward pass succeeds
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
+        if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
+
+        if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():  # OpenCL targets are considered only when OpenCL is present and enabled
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
+            self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
+            if cv.ocl_Device.getDefault().isIntel():  # Inference Engine OpenCL targets are probed only on an Intel default device
+                if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL):
+                    self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
+                if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16):
+                    self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
+
     def find_dnn_file(self, filename, required=True):
-        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd())], required=required)
+        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd()),  # first location: OPENCV_DNN_TEST_DATA_PATH, or the current directory when it is unset
+                                         os.environ['OPENCV_TEST_DATA_PATH']],  # NOTE(review): raises KeyError when OPENCV_TEST_DATA_PATH is unset
+                              required=required)
+
+    def checkIETarget(self, backend, target):
+        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=True)
+        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=True)
+        net = cv.dnn.readNet(proto, model)
+        net.setPreferableBackend(backend)
+        net.setPreferableTarget(target)
+        inp = np.random.standard_normal([1, 2, 10, 11]).astype(np.float32)
+        try:
+            net.setInput(inp)
+            net.forward()
+        except BaseException as e:
+            return False
+        return True

    def test_blobFromImage(self):
        np.random.seed(324)
@ -148,7 +137,7 @@ class dnn_test(NewOpenCVTests):

    def test_face_detection(self):
        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
-        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt2', required=testdata_required)
+        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt', required=testdata_required)
        model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=testdata_required)
        if proto is None or model is None:
            raise unittest.SkipTest("Missing DNN test files (dnn/opencv_face_detector.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
@ -164,7 +153,7 @@ class dnn_test(NewOpenCVTests):
               [0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427,  0.5347801]]

        print('\n')
-        for backend, target in dnnBackendsAndTargets:
+        for backend, target in self.dnnBackendsAndTargets:
            printParams(backend, target)

            net = cv.dnn.readNet(proto, model)
@ -178,5 +167,52 @@ class dnn_test(NewOpenCVTests):

            normAssertDetections(self, ref, out, 0.5, scoresDiff, iouDiff)

+    def test_async(self):
+        timeout = 5000  # in milliseconds
+        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
+        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=testdata_required)
+        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=testdata_required)
+        if proto is None or model is None:
+            raise unittest.SkipTest("Missing DNN test files (dnn/layers/layer_convolution.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+
+        print('\n')
+        for backend, target in self.dnnBackendsAndTargets:
+            if backend != cv.dnn.DNN_BACKEND_INFERENCE_ENGINE:  # only Inference Engine pairs are exercised by this test
+                continue
+
+            printParams(backend, target)
+
+            netSync = cv.dnn.readNet(proto, model)
+            netSync.setPreferableBackend(backend)
+            netSync.setPreferableTarget(target)
+
+            netAsync = cv.dnn.readNet(proto, model)
+            netAsync.setPreferableBackend(backend)
+            netAsync.setPreferableTarget(target)
+
+            # Generate inputs
+            numInputs = 10
+            inputs = []
+            for _ in range(numInputs):
+                inputs.append(np.random.standard_normal([2, 6, 75, 113]).astype(np.float32))
+
+            # Run synchronously
+            refs = []
+            for i in range(numInputs):
+                netSync.setInput(inputs[i])
+                refs.append(netSync.forward())
+
+            # Submit all asynchronous requests first, in reversed input order, before waiting on any of them.
+            outs = []
+            for i in reversed(range(numInputs)):
+                netAsync.setInput(inputs[i])
+                outs.insert(0, netAsync.forwardAsync())  # inserting at the front keeps outs[i] aligned with inputs[i]
+
+            for i in reversed(range(numInputs)):
+                if outs[i].wait_for(timeout) == 1:  # NOTE(review): 1 is presumably the integer value of std::future_status::timeout as exposed by the bindings - confirm
+                    self.fail("Timeout")
+                normAssert(self, refs[i], outs[i].get(), 'Index: %d' % i, 1e-10)
+
+
+
 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1030,6 +1030,7 @@ struct Net::Impl
        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
+        isAsync = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
@ -1051,6 +1052,7 @@ struct Net::Impl

    bool netWasAllocated;
    bool fusion;
+    bool isAsync;
    std::vector<int64> layersTimings;
    Mat output_blob;

@ -2258,6 +2260,9 @@ struct Net::Impl
            std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
            if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
            {
+                if (isAsync)
+                    CV_Error(Error::StsNotImplemented, "Default implementation fallbacks in asynchronous mode");
+
                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                {
                    std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
@ -2413,7 +2418,7 @@ struct Net::Impl
                }
                else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
                {
-                    forwardInfEngine(node);
+                    forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
                }
                else
                {
@ -2459,15 +2464,6 @@ struct Net::Impl
        forwardLayer(ld);
    }

-    void forwardAll()
-    {
-        CV_TRACE_FUNCTION();
-
-        MapIdToLayerData::reverse_iterator last_layer = layers.rbegin();
-        CV_Assert(last_layer != layers.rend());
-        forwardToLayer(last_layer->second, true);
-    }
-
    void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
    {
        std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
@ -2558,6 +2554,42 @@ struct Net::Impl
    {
        return getBlob(getPinByAlias(outputName));
    }
+
+#ifdef CV_CXX11
+    // Returns the future attached to the Inference Engine wrapper of the output
+    // blob addressed by `pin`. The future is moved out of the wrapper, so it can
+    // be retrieved only once per asynchronous run.
+    // Raises StsObjectNotFound for an invalid pin, StsOutOfRange for a bad output
+    // index and StsNotImplemented when built without HAVE_INF_ENGINE.
+    std::future<Mat> getBlobAsync(const LayerPin& pin)
+    {
+        CV_TRACE_FUNCTION();
+#ifdef HAVE_INF_ENGINE
+        if (!pin.valid())
+            CV_Error(Error::StsObjectNotFound, "Requested blob not found");
+
+        LayerData &ld = layers[pin.lid];
+        if ((size_t)pin.oid >= ld.outputBlobs.size())
+        {
+            // The size is cast to int so that the argument matches the %d conversion.
+            CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %d outputs, "
+                                           "the #%d was requested", ld.name.c_str(),
+                                           (int)ld.outputBlobs.size(), pin.oid));
+        }
+        if (preferableTarget != DNN_TARGET_CPU)
+        {
+            CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
+            // Transfer data to the CPU if it is required.
+            ld.outputBlobsWrappers[pin.oid]->copyToHost();
+        }
+        CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
+
+        // The wrapper is indexed and dereferenced below for every target, so
+        // validate the index and the cast result instead of relying on the
+        // non-CPU branch above.
+        CV_Assert((size_t)pin.oid < ld.outputBlobsWrappers.size());
+        Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
+        CV_Assert(!wrapper.empty());
+        return std::move(wrapper->futureMat);
+#else
+        CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
+#endif
+    }
+
+    std::future<Mat> getBlobAsync(String outputName)  // resolves outputName with getPinByAlias() and forwards to the LayerPin overload
+    {
+        return getBlobAsync(getPinByAlias(outputName));
+    }
+#endif  // CV_CXX11
 };

 Net::Net() : impl(new Net::Impl)
@ -2681,6 +2713,31 @@ Mat Net::forward(const String& outputName)
    return impl->getBlob(layerName);
 }

+// Starts a forward pass up to the layer `outputName` in asynchronous mode and
+// returns a future for that layer's output. When `outputName` is empty, the
+// last name returned by getLayerNames() is used.
+// Raises StsNotImplemented when the preferable backend is not
+// DNN_BACKEND_INFERENCE_ENGINE or when built without C++11.
+AsyncMat Net::forwardAsync(const String& outputName)
+{
+    CV_TRACE_FUNCTION();
+#ifdef CV_CXX11
+    if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
+        CV_Error(Error::StsNotImplemented, "Asynchronous forward for backend which is different from DNN_BACKEND_INFERENCE_ENGINE");
+
+    String layerName = outputName;
+
+    if (layerName.empty())
+    {
+        // back() on an empty container is undefined behaviour, so reject a
+        // network without layers explicitly.
+        const std::vector<String> names = getLayerNames();
+        CV_Assert(!names.empty());
+        layerName = names.back();
+    }
+
+    std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
+    impl->setUpNet(pins);
+
+    // The flag must be cleared even when forwardToLayer() throws; otherwise a
+    // later synchronous forward() on this net would still run in async mode.
+    impl->isAsync = true;
+    try
+    {
+        impl->forwardToLayer(impl->getLayerData(layerName));
+    }
+    catch (...)
+    {
+        impl->isAsync = false;
+        throw;
+    }
+    impl->isAsync = false;
+
+    return impl->getBlobAsync(layerName);
+#else
+    CV_Error(Error::StsNotImplemented, "Asynchronous forward without C++11");
+#endif  // CV_CXX11
+}
+
 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
 {
    CV_TRACE_FUNCTION();
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@ -168,7 +168,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
-        inpBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());
    }
    for (const auto& it : cnn.getOutputsInfo())
@ -176,7 +175,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
-        outBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
    }

@ -288,6 +286,24 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::La
    return wrapToInfEngineBlob(m, reversedShape, layout);
 }

+InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)  // creates a new blob with the precision, layout and dims of `blob`; no data is copied from `blob` here
+{
+    InferenceEngine::Precision precision = blob->precision();
+    InferenceEngine::Blob::Ptr copy;
+    if (precision == InferenceEngine::Precision::FP32)
+    {
+        copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+    }
+    else if (precision == InferenceEngine::Precision::U8)
+    {
+        copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "Unsupported blob precision");  // only FP32 and U8 blobs are handled
+    copy->allocate();  // callers are expected to fill the new blob themselves
+    return copy;
+}
+
 InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr)
 {
    CV_Assert(!ptr.empty());
@ -800,9 +816,6 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
        plugin = InferenceEngine::InferencePlugin(enginePtr);

        netExec = plugin.LoadNetwork(net, {});
-        infRequest = netExec.CreateInferRequest();
-        infRequest.SetInput(inpBlobs);
-        infRequest.SetOutput(outBlobs);
    }
    catch (const std::exception& ex)
    {
@ -828,9 +841,116 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs
    }
 }

-void InfEngineBackendNet::forward()
+// Replaces the promises of this request with fresh ones, one per output wrapper.
+// Each wrapper receives the matching future in its futureMat member, and the
+// name of each output is recorded in outsNames at the same index.
+void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Ptr<BackendWrapper> >& outsWrappers)
 {
-    infRequest.Infer();
+    auto outs = infEngineWrappers(outsWrappers);
+    outProms.clear();
+    outProms.resize(outs.size());
+    outsNames.resize(outs.size());
+    // size_t index: avoids comparing a signed int with the unsigned size().
+    for (size_t i = 0; i < outs.size(); ++i)
+    {
+        outs[i]->futureMat = outProms[i].get_future();
+        outsNames[i] = outs[i]->dataPtr->name;
+    }
+}
+
+void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,  // runs one inference on a ready or newly created request; in async mode results are delivered through futures set on outBlobsWrappers
+                                  bool isAsync)
+{
+    // Reuse the first request wrapper that is marked ready, if any.
+    Ptr<InfEngineReqWrapper> reqWrapper;
+    for (auto& wrapper : infRequests)
+    {
+        if (wrapper->isReady)
+        {
+            reqWrapper = wrapper;
+            break;
+        }
+    }
+    if (reqWrapper.empty())
+    {
+        reqWrapper = Ptr<InfEngineReqWrapper>(new InfEngineReqWrapper());  // no ready wrapper: create a request and wire up its blobs and callback once
+        try
+        {
+            reqWrapper->req = netExec.CreateInferRequest();
+        }
+        catch (const std::exception& ex)
+        {
+            CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what()));
+        }
+        infRequests.push_back(reqWrapper);
+
+        InferenceEngine::BlobMap inpBlobs, outBlobs;
+        for (const auto& it : cnn.getInputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);
+            CV_Assert(blobIt != allBlobs.end());
+            inpBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;  // async requests get their own input blobs; sync requests bind the blobs stored in allBlobs
+        }
+        for (const auto& it : cnn.getOutputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);
+            CV_Assert(blobIt != allBlobs.end());
+            outBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;  // same choice for outputs
+        }
+        reqWrapper->req.SetInput(inpBlobs);
+        reqWrapper->req.SetOutput(outBlobs);
+
+        InferenceEngine::IInferRequest::Ptr infRequestPtr = reqWrapper->req;
+        infRequestPtr->SetUserData(reqWrapper.get(), 0);  // lets the callback below recover the wrapper from the request
+
+        infRequestPtr->SetCompletionCallback({
+            [](InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode status)
+            {
+                InfEngineReqWrapper* wrapper;
+                request->GetUserData((void**)&wrapper, 0);
+                CV_Assert(wrapper);
+
+                for (int i = 0; i < wrapper->outProms.size(); ++i)
+                {
+                    const std::string& name = wrapper->outsNames[i];
+                    Mat m = infEngineBlobToMat(wrapper->req.GetBlob(name));  // NOTE(review): the blob is fetched even when status is not OK
+
+                    if (status == InferenceEngine::StatusCode::OK)
+                        wrapper->outProms[i].set_value(m.clone());  // clone() gives the future its own copy of the output data
+                    else
+                    {
+                        try {
+                            std::runtime_error e("Async request failed");
+                            wrapper->outProms[i].set_exception(std::make_exception_ptr(e));  // the failure surfaces when the caller reads the future
+                        } catch(...) {
+                            CV_LOG_ERROR(NULL, "DNN: Exception occured during async inference exception propagation");
+                        }
+                    }
+                }
+                wrapper->isReady = true;  // marks the wrapper as reusable by later forward() calls
+            }
+        });
+    }
+    if (isAsync)  // NOTE(review): a reused wrapper keeps the blobs chosen when it was created, whatever the current isAsync is - confirm that mixing sync and async calls on one net is intended to work
+    {
+        // Copy actual data to infer request's input blobs.
+        for (const auto& it : cnn.getInputsInfo())
+        {
+            const std::string& name = it.first;
+            auto blobIt = allBlobs.find(name);  // NOTE(review): unlike the loops above, the result is not checked against allBlobs.end()
+            Mat srcMat = infEngineBlobToMat(blobIt->second);
+            Mat dstMat = infEngineBlobToMat(reqWrapper->req.GetBlob(name));
+            srcMat.copyTo(dstMat);
+        }
+
+        // Set promises to output blobs wrappers.
+        reqWrapper->makePromises(outBlobsWrappers);
+
+        reqWrapper->isReady = false;  // cleared before StartAsync(); the completion callback sets it back to true
+        reqWrapper->req.StartAsync();
+    }
+    else
+    {
+        reqWrapper->req.Infer();
+    }
+}

 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
@ -920,14 +1040,15 @@ bool haveInfEngine()
 #endif  // HAVE_INF_ENGINE
 }

-void forwardInfEngine(Ptr<BackendNode>& node)
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,  // output wrappers and the async flag are passed through to InfEngineBackendNet::forward
+                      Ptr<BackendNode>& node, bool isAsync)
 {
     CV_Assert(haveInfEngine());
 #ifdef HAVE_INF_ENGINE
     CV_Assert(!node.empty());
     Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
     CV_Assert(!ieNode.empty());  // node must actually be an InfEngineBackendNode
-    ieNode->net->forward();
+    ieNode->net->forward(outBlobsWrappers, isAsync);
 #endif  // HAVE_INF_ENGINE
 }

--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@ -185,7 +185,8 @@ public:

    void init(int targetId);

-    void forward();
+    void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
+                 bool isAsync);

    void initPlugin(InferenceEngine::ICNNNetwork& net);

@ -197,12 +198,23 @@ private:
    InferenceEngine::InferenceEnginePluginPtr enginePtr;
    InferenceEngine::InferencePlugin plugin;
    InferenceEngine::ExecutableNetwork netExec;
-    InferenceEngine::InferRequest infRequest;
    InferenceEngine::BlobMap allBlobs;
-    InferenceEngine::BlobMap inpBlobs;
-    InferenceEngine::BlobMap outBlobs;
    InferenceEngine::TargetDevice targetDevice;

+    struct InfEngineReqWrapper  // one inference request together with the promises that deliver its outputs
+    {
+        InfEngineReqWrapper() : isReady(true) {}  // a new wrapper starts out marked as ready
+
+        void makePromises(const std::vector<Ptr<BackendWrapper> >& outs);
+
+        InferenceEngine::InferRequest req;
+        std::vector<std::promise<Mat> > outProms;  // one promise per output
+        std::vector<std::string> outsNames;  // output names, index-aligned with outProms
+        bool isReady;  // NOTE(review): plain bool written by the request's completion callback and read by forward(); if the callback runs on another thread this is unsynchronized (std::atomic<bool> would fix it) - confirm
+    };
+
+    std::vector<Ptr<InfEngineReqWrapper> > infRequests;
+
    InferenceEngine::CNNNetwork cnn;
    bool hasNetOwner;

@ -252,6 +264,7 @@ public:

    InferenceEngine::DataPtr dataPtr;
    InferenceEngine::Blob::Ptr blob;
+    std::future<Mat> futureMat;
 };

 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);
@ -302,7 +315,8 @@ CV__DNN_EXPERIMENTAL_NS_END

 bool haveInfEngine();

-void forwardInfEngine(Ptr<BackendNode>& node);
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
+                      Ptr<BackendNode>& node, bool isAsync);

 }}  // namespace dnn, namespace cv

--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@ -340,4 +340,106 @@ TEST(Net, forwardAndRetrieve)
    normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part");
 }

+#ifdef HAVE_INF_ENGINE
+// This test runs network in synchronous mode for different inputs and then
+// runs the same model asynchronously for the same inputs.
+typedef testing::TestWithParam<Target> Async;
+TEST_P(Async, set_and_forward_single)
+{
+    static const int kTimeout = 5000;  // in milliseconds.
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";  // the "_fp16" model files are used for the OPENCL_FP16 and MYRIAD targets
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+    Net netSync = readNet(model, proto);
+    netSync.setPreferableTarget(target);
+
+    Net netAsync = readNet(model, proto);
+    netAsync.setPreferableTarget(target);
+
+    // Generate inputs.
+    const int numInputs = 10;
+    std::vector<Mat> inputs(numInputs);
+    int blobSize[] = {2, 6, 75, 113};
+    for (int i = 0; i < numInputs; ++i)
+    {
+        inputs[i].create(4, &blobSize[0], CV_32FC1);
+        randu(inputs[i], 0.0f, 1.0f);
+    }
+
+    // Run synchronously.
+    std::vector<Mat> refs(numInputs);
+    for (int i = 0; i < numInputs; ++i)
+    {
+        netSync.setInput(inputs[i]);
+        refs[i] = netSync.forward().clone();  // clone() keeps each reference independent of later forward() calls
+    }
+
+    // Run asynchronously, one request at a time. To make test more robust, process inputs in the reversed order.
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        netAsync.setInput(inputs[i]);
+
+        std::future<Mat> out = netAsync.forwardAsync();  // each result is awaited before the next request is submitted
+        if (out.wait_for(std::chrono::milliseconds(kTimeout)) == std::future_status::timeout)
+            CV_Error(Error::StsAssert, "Timeout");
+        normAssert(refs[i], out.get(), format("Index: %d", i).c_str(), 0, 0);  // the two trailing zeros are presumably the allowed error thresholds - confirm
+    }
+}
+
+TEST_P(Async, set_and_forward_all)
+{
+    static const int kTimeout = 5000;  // in milliseconds.
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";  // the "_fp16" model files are used for the OPENCL_FP16 and MYRIAD targets
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+
+    Net netSync = readNet(model, proto);
+    netSync.setPreferableTarget(target);
+
+    Net netAsync = readNet(model, proto);
+    netAsync.setPreferableTarget(target);
+
+    // Generate inputs.
+    const int numInputs = 10;
+    std::vector<Mat> inputs(numInputs);
+    int blobSize[] = {2, 6, 75, 113};
+    for (int i = 0; i < numInputs; ++i)
+    {
+        inputs[i].create(4, &blobSize[0], CV_32FC1);
+        randu(inputs[i], 0.0f, 1.0f);
+    }
+
+    // Run synchronously.
+    std::vector<Mat> refs(numInputs);
+    for (int i = 0; i < numInputs; ++i)
+    {
+        netSync.setInput(inputs[i]);
+        refs[i] = netSync.forward().clone();  // clone() keeps each reference independent of later forward() calls
+    }
+
+    // Submit every asynchronous request before waiting on any result. To make test more robust, process inputs in the reversed order.
+    std::vector<std::future<Mat> > outs(numInputs);
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        netAsync.setInput(inputs[i]);
+        outs[i] = netAsync.forwardAsync();  // several requests may be in flight at the same time
+    }
+
+    for (int i = numInputs - 1; i >= 0; --i)
+    {
+        if (outs[i].wait_for(std::chrono::milliseconds(kTimeout)) == std::future_status::timeout)
+            CV_Error(Error::StsAssert, "Timeout");
+        normAssert(refs[i], outs[i].get(), format("Index: %d", i).c_str(), 0, 0);  // the two trailing zeros are presumably the allowed error thresholds - confirm
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Async, testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)));
+#endif  // HAVE_INF_ENGINE
+
 }} // namespace