Merge pull request #13694 from dkurt:dnn_ie_async

Asynchronous API from Intel's Inference Engine (#13694) * Add forwardAsync for asynchronous mode from Intel's Inference Engine * Python test for forwardAsync * Replace Future_Mat with AsyncMat * Shadow AsyncMat * Isolate InferRequest callback * Manage exceptions in Async API of IE
2025-07-25 22:57:53 +08:00 · 2019-04-19 21:01:19 +03:00 · 2019-04-19 21:01:19 +03:00 · a5c92c2029
commit a5c92c2029
parent 3abae3c511
8 changed files with 503 additions and 82 deletions
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@ -44,11 +44,14 @@
 #include <vector>
 #include <opencv2/core.hpp>
 #ifdef CV_CXX11
 #include <future>
 #endif
 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
-#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v11 {
+#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v12 {
 #define CV__DNN_EXPERIMENTAL_NS_END }
-namespace cv { namespace dnn { namespace experimental_dnn_34_v11 { } using namespace experimental_dnn_34_v11; }}
+namespace cv { namespace dnn { namespace experimental_dnn_34_v12 { } using namespace experimental_dnn_34_v12; }}
 #else
 #define CV__DNN_EXPERIMENTAL_NS_BEGIN
 #define CV__DNN_EXPERIMENTAL_NS_END
@ -64,6 +67,18 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
    typedef std::vector<int> MatShape;
 #if defined(CV_CXX11) || defined(CV_DOXYGEN)
    typedef std::future<Mat> AsyncMat;
 #else
    // Just a workaround for bindings.
    struct AsyncMat
    {
        // Stand-in result: an empty Mat.
        Mat get()
        {
            return Mat();
        }

        // Nothing to wait for in this placeholder.
        void wait() const
        {
        }

        // The timeout is ignored; the result is -1 converted to size_t.
        size_t wait_for(size_t milliseconds) const
        {
            CV_UNUSED(milliseconds);
            return static_cast<size_t>(-1);
        }
    };
 #endif
    /**
     * @brief Enum of computation backends supported by layers.
     * @see Net::setPreferableBackend
@ -75,7 +90,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        //! DNN_BACKEND_OPENCV otherwise.
        DNN_BACKEND_DEFAULT,
        DNN_BACKEND_HALIDE,
-        DNN_BACKEND_INFERENCE_ENGINE,
+        DNN_BACKEND_INFERENCE_ENGINE,  //!< Intel's Inference Engine computational backend.
        DNN_BACKEND_OPENCV
    };
@ -89,8 +104,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        DNN_TARGET_OPENCL,
        DNN_TARGET_OPENCL_FP16,
        DNN_TARGET_MYRIAD,
-        //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+        DNN_TARGET_FPGA  //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
        DNN_TARGET_FPGA
    };
    CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends();
@ -462,6 +476,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         */
        CV_WRAP Mat forward(const String& outputName = String());
        /** @brief Runs forward pass to compute output of layer with name @p outputName.
         *  @param outputName name for layer which output is needed to get
         *  @returns AsyncMat object from which the output blob is retrieved with get()
         *           (std::future<Mat> when C++11 is available).
         *  @details By default runs forward pass for the whole network.
         *
         *  This is an asynchronous version of forward(const String&).
         *  dnn::DNN_BACKEND_INFERENCE_ENGINE backend is required; an
         *  Error::StsNotImplemented error is raised for any other backend.
         */
        CV_WRAP AsyncMat forwardAsync(const String& outputName = String());
        /** @brief Runs forward pass to compute output of layer with name @p outputName.
         *  @param outputBlobs contains all output blobs for specified layer.
         *  @param outputName name for layer which output is needed to get
--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@ -2,7 +2,13 @@
 typedef dnn::DictValue LayerId;
 typedef std::vector<dnn::MatShape> vector_MatShape;
 typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
-
+#ifdef CV_CXX11
 typedef std::chrono::milliseconds chrono_milliseconds;
 typedef std::future_status AsyncMatStatus;
 #else
 typedef size_t chrono_milliseconds;
 typedef size_t AsyncMatStatus;
 #endif
 template<>
 bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
@ -40,6 +46,46 @@ bool pyopencv_to(PyObject *o, std::vector<Mat> &blobs, const char *name) //requi
  return pyopencvVecConverter<Mat>::to(o, blobs, ArgInfo(name, false));
 }
 #ifdef CV_CXX11
 template<>
 PyObject* pyopencv_from(const std::future<Mat>& f_)
 {
    // std::future is move-only: its state is moved out of the (nominally const)
    // argument into a heap-allocated future owned by a Ptr, which is then converted.
    Ptr<cv::dnn::AsyncMat> holder(new std::future<Mat>(std::move(const_cast<std::future<Mat>&>(f_))));
    return pyopencv_from(holder);
 }
 template<>
 PyObject* pyopencv_from(const std::future_status& status)
 {
    // The status is handed to Python as its integer value.
    const int statusCode = static_cast<int>(status);
    return pyopencv_from(statusCode);
 }
 template<>
 bool pyopencv_to(PyObject* src, std::chrono::milliseconds& dst, const char* name)
 {
    // The Python value is first read as an unsigned integer number of milliseconds.
    size_t millis = 0;
    if (!pyopencv_to(src, millis, name))
        return false;  // dst is left untouched when the conversion fails

    dst = std::chrono::milliseconds(millis);
    return true;
 }
 #else
 template<>
 PyObject* pyopencv_from(const cv::dnn::AsyncMat&)
 {
    // Without C++11 there is nothing to convert: always report an error.
    CV_Error(Error::StsNotImplemented, "C++11 is required.");
    return NULL;
 }
 #endif  // CV_CXX11
 template<typename T>
 PyObject* pyopencv_from(const dnn::DictValue &dv)
 {
--- a/modules/dnn/misc/python/shadow_async_mat.hpp
+++ b/modules/dnn/misc/python/shadow_async_mat.hpp
@ -0,0 +1,22 @@
 #error This is a shadow header file, which is not intended for processing by any compiler. \
       Only bindings parser should handle this file.
 namespace cv { namespace dnn {
 /** @brief Shadow declaration of AsyncMat for the bindings parser.
  *
  *  Declares the methods exported through CV_WRAP: get(), wait() and wait_for().
  */
 class CV_EXPORTS_W AsyncMat
 {
 public:
    //! Wait for the Mat object to become ready and return it.
    CV_WRAP Mat get();
    //! Wait for the Mat object to become ready.
    CV_WRAP void wait() const;
    /** @brief Wait for the Mat object to become ready for a specific amount of time.
     *  @param timeout Timeout in milliseconds
     *  @returns [std::future_status](https://en.cppreference.com/w/cpp/thread/future_status)
     */
    CV_WRAP AsyncMatStatus wait_for(std::chrono::milliseconds timeout) const;
 };
 }}
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@ -5,8 +5,8 @@ import numpy as np
 from tests_common import NewOpenCVTests, unittest
-def normAssert(test, a, b, lInf=1e-5):
+def normAssert(test, a, b, msg=None, lInf=1e-5):
-    test.assertLess(np.max(np.abs(a - b)), lInf)
+    test.assertLess(np.max(np.abs(a - b)), lInf, msg)
 def inter_area(box1, box2):
    x_min, x_max = max(box1[0], box2[0]), min(box1[2], box2[2])
@ -53,53 +53,6 @@ def normAssertDetections(test, ref, out, confThreshold=0.0, scores_diff=1e-5, bo
    if errMsg:
        test.fail(errMsg)
 # Builds a single-layer ("Identity") network from an in-memory Caffe prototxt
 # and returns it.
 def getSimpleNet():
    netDescription = """
        name: "simpleNet"
        input: "data"
        layer {
          type: "Identity"
          name: "testLayer"
          top: "testLayer"
          bottom: "data"
        }
    """
    protoBuffer = bytearray(netDescription, 'utf8')
    return cv.dnn.readNetFromCaffe(protoBuffer)
 # Probes whether the simple network can run a forward pass with the given
 # backend/target pair. Returns True on success and False if setting the input
 # or running the forward pass raises an ordinary exception.
 def testBackendAndTarget(backend, target):
    net = getSimpleNet()
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)
    inp = np.random.standard_normal([1, 2, 3, 4]).astype(np.float32)
    try:
        net.setInput(inp)
        net.forward()
    except Exception:
        # Only regular errors mark the combination as unavailable;
        # KeyboardInterrupt and SystemExit are allowed to propagate.
        return False
    return True
 # Module-level list of [backend, target] pairs: the OpenCV/CPU pair is always
 # present, the others are added only when the corresponding probe succeeds.
 haveInfEngine = testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU)
 dnnBackendsAndTargets = [[cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU]]
 if haveInfEngine:
    dnnBackendsAndTargets += [[cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU]]
    if testBackendAndTarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
        dnnBackendsAndTargets += [[cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD]]
 if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
    dnnBackendsAndTargets += [
        [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL],
        [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16],
    ]
    if haveInfEngine and cv.ocl_Device.getDefault().isIntel():
        dnnBackendsAndTargets += [
            [cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL],
            [cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16],
        ]
 def printParams(backend, target):
    backendNames = {
        cv.dnn.DNN_BACKEND_OPENCV: 'OCV',
@ -116,8 +69,44 @@ def printParams(backend, target):
 class dnn_test(NewOpenCVTests):
    def __init__(self, *args, **kwargs):
        super(dnn_test, self).__init__(*args, **kwargs)

        # OpenCV/CPU is always tested; every Inference Engine pair is added
        # only if checkIETarget() succeeds for it.
        ieBackend = cv.dnn.DNN_BACKEND_INFERENCE_ENGINE
        self.dnnBackendsAndTargets = [
            [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
        ]
        for ieTarget in (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_MYRIAD):
            if self.checkIETarget(ieBackend, ieTarget):
                self.dnnBackendsAndTargets.append([ieBackend, ieTarget])

        if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
            oclTargets = (cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16)
            for oclTarget in oclTargets:
                self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, oclTarget])
            if cv.ocl_Device.getDefault().isIntel():
                for oclTarget in oclTargets:
                    if self.checkIETarget(ieBackend, oclTarget):
                        self.dnnBackendsAndTargets.append([ieBackend, oclTarget])
    def find_dnn_file(self, filename, required=True):
-        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd())], required=required)
+        return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd()),
                                         os.environ['OPENCV_TEST_DATA_PATH']],
                              required=required)
    # Probes whether the convolution test model can run a forward pass with the
    # given backend/target pair. Returns True on success and False if setting
    # the input or running the forward pass raises an ordinary exception.
    def checkIETarget(self, backend, target):
        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=True)
        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=True)
        net = cv.dnn.readNet(proto, model)
        net.setPreferableBackend(backend)
        net.setPreferableTarget(target)
        inp = np.random.standard_normal([1, 2, 10, 11]).astype(np.float32)
        try:
            net.setInput(inp)
            net.forward()
        except Exception:
            # Only regular errors mark the combination as unavailable;
            # KeyboardInterrupt and SystemExit are allowed to propagate.
            return False
        return True
    def test_blobFromImage(self):
        np.random.seed(324)
@ -148,7 +137,7 @@ class dnn_test(NewOpenCVTests):
    def test_face_detection(self):
        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
-        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt2', required=testdata_required)
+        proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt', required=testdata_required)
        model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=testdata_required)
        if proto is None or model is None:
            raise unittest.SkipTest("Missing DNN test files (dnn/opencv_face_detector.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
@ -164,7 +153,7 @@ class dnn_test(NewOpenCVTests):
               [0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427,  0.5347801]]
        print('\n')
-        for backend, target in dnnBackendsAndTargets:
+        for backend, target in self.dnnBackendsAndTargets:
            printParams(backend, target)
            net = cv.dnn.readNet(proto, model)
@ -178,5 +167,52 @@ class dnn_test(NewOpenCVTests):
            normAssertDetections(self, ref, out, 0.5, scoresDiff, iouDiff)
    def test_async(self):
        # Compares results of forwardAsync() against synchronous forward() for
        # every Inference Engine backend/target pair collected by the test class.
        timeout = 5000  # in milliseconds
        testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
        proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt', required=testdata_required)
        model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel', required=testdata_required)
        if proto is None or model is None:
            raise unittest.SkipTest("Missing DNN test files (dnn/layers/layer_convolution.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")

        print('\n')
        for backend, target in self.dnnBackendsAndTargets:
            if backend == cv.dnn.DNN_BACKEND_INFERENCE_ENGINE:
                printParams(backend, target)

                netSync = cv.dnn.readNet(proto, model)
                netSync.setPreferableBackend(backend)
                netSync.setPreferableTarget(target)

                netAsync = cv.dnn.readNet(proto, model)
                netAsync.setPreferableBackend(backend)
                netAsync.setPreferableTarget(target)

                # Random input blobs shared by both runs.
                numInputs = 10
                inputs = [np.random.standard_normal([2, 6, 75, 113]).astype(np.float32)
                          for _ in range(numInputs)]

                # Reference outputs from the synchronous network.
                refs = []
                for blob in inputs:
                    netSync.setInput(blob)
                    refs.append(netSync.forward())

                # Asynchronous requests are issued from the last input to the first,
                # each handle being stored at the index of its input.
                outs = [None] * numInputs
                for i in range(numInputs - 1, -1, -1):
                    netAsync.setInput(inputs[i])
                    outs[i] = netAsync.forwardAsync()

                for i in range(numInputs - 1, -1, -1):
                    if outs[i].wait_for(timeout) == 1:
                        self.fail("Timeout")
                    normAssert(self, refs[i], outs[i].get(), 'Index: %d' % i, 1e-10)
 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1030,6 +1030,7 @@ struct Net::Impl
        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
        isAsync = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
@ -1051,6 +1052,7 @@ struct Net::Impl
    bool netWasAllocated;
    bool fusion;
    bool isAsync;
    std::vector<int64> layersTimings;
    Mat output_blob;
@ -2258,6 +2260,9 @@ struct Net::Impl
            std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
            if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
            {
                if (isAsync)
                    CV_Error(Error::StsNotImplemented, "Default implementation fallbacks in asynchronous mode");
                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                {
                    std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
@ -2413,7 +2418,7 @@ struct Net::Impl
                }
                else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
                {
-                    forwardInfEngine(node);
+                    forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
                }
                else
                {
@ -2459,15 +2464,6 @@ struct Net::Impl
        forwardLayer(ld);
    }
    // Runs the forward pass up to the layer stored last in `layers`.
    void forwardAll()
    {
        CV_TRACE_FUNCTION();
        // Take the last entry of the layers map; the assertion rejects an empty map.
        MapIdToLayerData::reverse_iterator last_layer = layers.rbegin();
        CV_Assert(last_layer != layers.rend());
        forwardToLayer(last_layer->second, true);
    }
    void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
    {
        std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
@ -2558,6 +2554,42 @@ struct Net::Impl
    {
        return getBlob(getPinByAlias(outputName));
    }
 #ifdef CV_CXX11
    // Returns the future attached to the output blob addressed by @p pin.
    // The future is moved out of the Inference Engine wrapper of that output,
    // so the wrapper no longer holds a valid future afterwards.
    // Errors: StsObjectNotFound for an invalid pin, StsOutOfRange if the layer
    // has no output with index pin.oid, an assertion failure if the backend is
    // not DNN_BACKEND_INFERENCE_ENGINE or the output has no Inference Engine
    // wrapper, StsNotImplemented when built without HAVE_INF_ENGINE.
    std::future<Mat> getBlobAsync(const LayerPin& pin)
    {
        CV_TRACE_FUNCTION();
 #ifdef HAVE_INF_ENGINE
        if (!pin.valid())
            CV_Error(Error::StsObjectNotFound, "Requested blob not found");

        LayerData &ld = layers[pin.lid];
        if ((size_t)pin.oid >= ld.outputBlobs.size())
        {
            // The count is cast to int: passing size_t for "%d" is a varargs type mismatch.
            CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %d outputs, "
                                           "the #%d was requested", ld.name.c_str(),
                                           (int)ld.outputBlobs.size(), pin.oid));
        }

        CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
        // The wrapper is indexed below for every target, so validate it unconditionally.
        CV_Assert((size_t)pin.oid < ld.outputBlobsWrappers.size() && !ld.outputBlobsWrappers[pin.oid].empty());
        if (preferableTarget != DNN_TARGET_CPU)
        {
            // Transfer data to CPU if it is required.
            ld.outputBlobsWrappers[pin.oid]->copyToHost();
        }

        Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
        // dynamicCast yields an empty Ptr for a wrapper of another type.
        CV_Assert(!wrapper.empty());
        return std::move(wrapper->futureMat);
 #else
        CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
 #endif
    }
    // Name-based overload: resolves the alias to a pin and delegates to the
    // pin-based getBlobAsync().
    std::future<Mat> getBlobAsync(String outputName)
    {
        const LayerPin pin = getPinByAlias(outputName);
        return getBlobAsync(pin);
    }
 #endif  // CV_CXX11
 };
 Net::Net() : impl(new Net::Impl)
@ -2681,6 +2713,31 @@ Mat Net::forward(const String& outputName)
    return impl->getBlob(layerName);
 }
 // Starts an asynchronous forward pass up to layer @p outputName (the last layer
 // when the name is empty) and returns the handle of its output blob.
 // Raises StsNotImplemented if the preferable backend is not
 // DNN_BACKEND_INFERENCE_ENGINE or if built without C++11.
 AsyncMat Net::forwardAsync(const String& outputName)
 {
    CV_TRACE_FUNCTION();
 #ifdef CV_CXX11
    if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward for backend which is different from DNN_BACKEND_INFERENCE_ENGINE");

    String layerName = outputName;
    if (layerName.empty())
    {
        // back() on an empty vector is undefined behaviour, so check first.
        const std::vector<String> layerNames = getLayerNames();
        CV_Assert(!layerNames.empty());
        layerName = layerNames.back();
    }

    std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
    impl->setUpNet(pins);

    // The async flag must be cleared even if the forward pass throws,
    // otherwise later synchronous forward() calls would still run in async mode.
    impl->isAsync = true;
    try
    {
        impl->forwardToLayer(impl->getLayerData(layerName));
    }
    catch (...)
    {
        impl->isAsync = false;
        throw;
    }
    impl->isAsync = false;

    return impl->getBlobAsync(layerName);
 #else
    CV_Error(Error::StsNotImplemented, "Asynchronous forward without C++11");
 #endif  // CV_CXX11
 }
 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
 {
    CV_TRACE_FUNCTION();
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@ -168,7 +168,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
        inpBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());
    }
    for (const auto& it : cnn.getOutputsInfo())
@ -176,7 +175,6 @@ void InfEngineBackendNet::init(int targetId)
        const std::string& name = it.first;
        auto blobIt = allBlobs.find(name);
        CV_Assert(blobIt != allBlobs.end());
        outBlobs[name] = blobIt->second;
        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
    }
@ -288,6 +286,24 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::La
    return wrapToInfEngineBlob(m, reversedShape, layout);
 }
 // Creates a newly allocated blob with the precision, layout and dimensions of
 // @p blob. Only FP32 and U8 precisions are handled; any other precision raises
 // Error::StsNotImplemented. The contents of @p blob are not copied here.
 InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)
 {
    InferenceEngine::Precision srcPrecision = blob->precision();
    const bool isFP32 = srcPrecision == InferenceEngine::Precision::FP32;
    const bool isU8 = !isFP32 && srcPrecision == InferenceEngine::Precision::U8;
    if (!isFP32 && !isU8)
        CV_Error(Error::StsNotImplemented, "Unsupported blob precision");

    InferenceEngine::Blob::Ptr result;
    if (isFP32)
        result = InferenceEngine::make_shared_blob<float>(srcPrecision, blob->layout(), blob->dims());
    else
        result = InferenceEngine::make_shared_blob<uint8_t>(srcPrecision, blob->layout(), blob->dims());

    result->allocate();
    return result;
 }
 InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr)
 {
    CV_Assert(!ptr.empty());
@ -800,9 +816,6 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
        plugin = InferenceEngine::InferencePlugin(enginePtr);
        netExec = plugin.LoadNetwork(net, {});
        infRequest = netExec.CreateInferRequest();
        infRequest.SetInput(inpBlobs);
        infRequest.SetOutput(outBlobs);
    }
    catch (const std::exception& ex)
    {
@ -828,9 +841,116 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs
    }
 }
-void InfEngineBackendNet::forward()
+void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Ptr<BackendWrapper> >& outsWrappers)
 {
-    infRequest.Infer();
+    auto outs = infEngineWrappers(outsWrappers);
    outProms.clear();
    outProms.resize(outs.size());
    outsNames.resize(outs.size());
    for (int i = 0; i < outs.size(); ++i)
    {
        outs[i]->futureMat = outProms[i].get_future();
        outsNames[i] = outs[i]->dataPtr->name;
    }
 }
 // Runs inference on a request taken from `infRequests` (the first one marked
 // ready) or on a newly created one.
 //  - Synchronous mode: calls Infer() on the request.
 //  - Asynchronous mode: copies the current input blobs into the request's own
 //    input blobs, attaches fresh promises to @p outBlobsWrappers and calls
 //    StartAsync(); the completion callback fulfils the promises.
 // A new request gets cloned blobs when it is created in asynchronous mode and
 // the network's own blobs otherwise.
 // NOTE(review): a request created in one mode can later be picked up by the
 // other mode with the blob set of its creation mode - confirm this is intended.
 void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                                   bool isAsync)
 {
    // Look for finished requests.
    Ptr<InfEngineReqWrapper> reqWrapper;
    for (auto& wrapper : infRequests)
    {
        if (wrapper->isReady)
        {
            reqWrapper = wrapper;
            break;
        }
    }
    if (reqWrapper.empty())
    {
        reqWrapper = Ptr<InfEngineReqWrapper>(new InfEngineReqWrapper());
        try
        {
            reqWrapper->req = netExec.CreateInferRequest();
        }
        catch (const std::exception& ex)
        {
            CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what()));
        }
        infRequests.push_back(reqWrapper);

        InferenceEngine::BlobMap inpBlobs, outBlobs;
        for (const auto& it : cnn.getInputsInfo())
        {
            const std::string& name = it.first;
            auto blobIt = allBlobs.find(name);
            CV_Assert(blobIt != allBlobs.end());
            inpBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;
        }
        for (const auto& it : cnn.getOutputsInfo())
        {
            const std::string& name = it.first;
            auto blobIt = allBlobs.find(name);
            CV_Assert(blobIt != allBlobs.end());
            outBlobs[name] = isAsync ? cloneBlob(blobIt->second) : blobIt->second;
        }
        reqWrapper->req.SetInput(inpBlobs);
        reqWrapper->req.SetOutput(outBlobs);

        InferenceEngine::IInferRequest::Ptr infRequestPtr = reqWrapper->req;
        infRequestPtr->SetUserData(reqWrapper.get(), 0);

        infRequestPtr->SetCompletionCallback({
            [](InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode status)
            {
                // Initialized so that the assertion below is meaningful even if
                // GetUserData() does not write the pointer.
                InfEngineReqWrapper* wrapper = NULL;
                request->GetUserData((void**)&wrapper, 0);
                CV_Assert(wrapper);

                for (size_t i = 0; i < wrapper->outProms.size(); ++i)
                {
                    if (status == InferenceEngine::StatusCode::OK)
                    {
                        // The output blob is read only for successful requests. Any failure
                        // while publishing the value is forwarded through the promise
                        // instead of escaping from the callback.
                        try
                        {
                            const std::string& name = wrapper->outsNames[i];
                            Mat m = infEngineBlobToMat(wrapper->req.GetBlob(name));
                            wrapper->outProms[i].set_value(m.clone());
                        }
                        catch (...)
                        {
                            try {
                                wrapper->outProms[i].set_exception(std::current_exception());
                            } catch(...) {
                                CV_LOG_ERROR(NULL, "DNN: Exception occured during async inference exception propagation");
                            }
                        }
                    }
                    else
                    {
                        try {
                            std::runtime_error e("Async request failed");
                            wrapper->outProms[i].set_exception(std::make_exception_ptr(e));
                        } catch(...) {
                            CV_LOG_ERROR(NULL, "DNN: Exception occured during async inference exception propagation");
                        }
                    }
                }
                wrapper->isReady = true;
            }
        });
    }
    if (isAsync)
    {
        // Copy actual data to infer request's input blobs.
        for (const auto& it : cnn.getInputsInfo())
        {
            const std::string& name = it.first;
            auto blobIt = allBlobs.find(name);
            CV_Assert(blobIt != allBlobs.end());
            Mat srcMat = infEngineBlobToMat(blobIt->second);
            Mat dstMat = infEngineBlobToMat(reqWrapper->req.GetBlob(name));
            srcMat.copyTo(dstMat);
        }

        // Set promises to output blobs wrappers.
        reqWrapper->makePromises(outBlobsWrappers);

        reqWrapper->isReady = false;
        reqWrapper->req.StartAsync();
    }
    else
    {
        reqWrapper->req.Infer();
    }
 }
 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
@ -920,14 +1040,15 @@ bool haveInfEngine()
 #endif  // HAVE_INF_ENGINE
 }
-void forwardInfEngine(Ptr<BackendNode>& node)
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                      Ptr<BackendNode>& node, bool isAsync)
 {
    CV_Assert(haveInfEngine());
 #ifdef HAVE_INF_ENGINE
    CV_Assert(!node.empty());
    Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
    CV_Assert(!ieNode.empty());
-    ieNode->net->forward();
+    ieNode->net->forward(outBlobsWrappers, isAsync);
 #endif  // HAVE_INF_ENGINE
 }
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@ -185,7 +185,8 @@ public:
    void init(int targetId);
-    void forward();
+    void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                 bool isAsync);
    void initPlugin(InferenceEngine::ICNNNetwork& net);
@ -197,12 +198,23 @@ private:
    InferenceEngine::InferenceEnginePluginPtr enginePtr;
    InferenceEngine::InferencePlugin plugin;
    InferenceEngine::ExecutableNetwork netExec;
    InferenceEngine::InferRequest infRequest;
    InferenceEngine::BlobMap allBlobs;
    InferenceEngine::BlobMap inpBlobs;
    InferenceEngine::BlobMap outBlobs;
    InferenceEngine::TargetDevice targetDevice;
    // Bundles one Inference Engine infer request with the promises that
    // publish its outputs.
    struct InfEngineReqWrapper
    {
        // A new wrapper starts out marked as ready.
        InfEngineReqWrapper() : isReady(true) {}
        // Creates a fresh promise per output wrapper, stores the matching future
        // in that wrapper and records the output's name.
        void makePromises(const std::vector<Ptr<BackendWrapper> >& outs);
        InferenceEngine::InferRequest req;
        // One promise per output; fulfilled by the request's completion callback.
        std::vector<std::promise<Mat> > outProms;
        // Output names, index-aligned with outProms.
        std::vector<std::string> outsNames;
        // Cleared before StartAsync() and set again at the end of the completion callback.
        // NOTE(review): plain bool - if the callback runs on another thread this
        // access is unsynchronized; confirm and consider an atomic flag.
        bool isReady;
    };
    std::vector<Ptr<InfEngineReqWrapper> > infRequests;
    InferenceEngine::CNNNetwork cnn;
    bool hasNetOwner;
@ -252,6 +264,7 @@ public:
    InferenceEngine::DataPtr dataPtr;
    InferenceEngine::Blob::Ptr blob;
    std::future<Mat> futureMat;
 };
 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);
@ -302,7 +315,8 @@ CV__DNN_EXPERIMENTAL_NS_END
 bool haveInfEngine();
-void forwardInfEngine(Ptr<BackendNode>& node);
+void forwardInfEngine(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                      Ptr<BackendNode>& node, bool isAsync);
 }}  // namespace dnn, namespace cv
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@ -340,4 +340,106 @@ TEST(Net, forwardAndRetrieve)
    normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part");
 }
 #ifdef HAVE_INF_ENGINE
 // This test runs network in synchronous mode for different inputs and then
 // runs the same model asynchronously for the same inputs.
 typedef testing::TestWithParam<Target> Async;
 // Each asynchronous request is awaited and compared against the synchronous
 // reference before the next one is issued.
 TEST_P(Async, set_and_forward_single)
 {
    const std::chrono::milliseconds timeout(5000);

    const int target = GetParam();
    const bool useFP16Model = target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD;
    const std::string basePath = std::string("dnn/layers/layer_convolution") + (useFP16Model ? "_fp16" : "");
    const std::string model = findDataFile(basePath + ".bin");
    const std::string proto = findDataFile(basePath + ".xml");

    Net netSync = readNet(model, proto);
    netSync.setPreferableTarget(target);

    Net netAsync = readNet(model, proto);
    netAsync.setPreferableTarget(target);

    // Random input blobs shared by both networks.
    const int numInputs = 10;
    const int blobShape[] = {2, 6, 75, 113};
    std::vector<Mat> inputs(numInputs);
    for (Mat& input : inputs)
    {
        input.create(4, blobShape, CV_32FC1);
        randu(input, 0.0f, 1.0f);
    }

    // Reference outputs from the synchronous network.
    std::vector<Mat> refs;
    refs.reserve(numInputs);
    for (const Mat& input : inputs)
    {
        netSync.setInput(input);
        refs.push_back(netSync.forward().clone());
    }

    // Asynchronous run, from the last input to the first.
    for (int i = numInputs; i-- > 0; )
    {
        netAsync.setInput(inputs[i]);

        std::future<Mat> out = netAsync.forwardAsync();
        const std::future_status status = out.wait_for(timeout);
        if (status == std::future_status::timeout)
            CV_Error(Error::StsAssert, "Timeout");
        normAssert(refs[i], out.get(), format("Index: %d", i).c_str(), 0, 0);
    }
 }
 // All asynchronous requests are issued first; their results are awaited and
 // compared against the synchronous references afterwards.
 TEST_P(Async, set_and_forward_all)
 {
    const std::chrono::milliseconds timeout(5000);

    const int target = GetParam();
    const bool useFP16Model = target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD;
    const std::string basePath = std::string("dnn/layers/layer_convolution") + (useFP16Model ? "_fp16" : "");
    const std::string model = findDataFile(basePath + ".bin");
    const std::string proto = findDataFile(basePath + ".xml");

    Net netSync = readNet(model, proto);
    netSync.setPreferableTarget(target);

    Net netAsync = readNet(model, proto);
    netAsync.setPreferableTarget(target);

    // Random input blobs shared by both networks.
    const int numInputs = 10;
    const int blobShape[] = {2, 6, 75, 113};
    std::vector<Mat> inputs(numInputs);
    for (Mat& input : inputs)
    {
        input.create(4, blobShape, CV_32FC1);
        randu(input, 0.0f, 1.0f);
    }

    // Reference outputs from the synchronous network.
    std::vector<Mat> refs;
    refs.reserve(numInputs);
    for (const Mat& input : inputs)
    {
        netSync.setInput(input);
        refs.push_back(netSync.forward().clone());
    }

    // Issue every asynchronous request, from the last input to the first.
    std::vector<std::future<Mat> > outs(numInputs);
    for (int i = numInputs; i-- > 0; )
    {
        netAsync.setInput(inputs[i]);
        outs[i] = netAsync.forwardAsync();
    }

    // Collect the results in the same (reversed) order.
    for (int i = numInputs; i-- > 0; )
    {
        const std::future_status status = outs[i].wait_for(timeout);
        if (status == std::future_status::timeout)
            CV_Error(Error::StsAssert, "Timeout");
        normAssert(refs[i], outs[i].get(), format("Index: %d", i).c_str(), 0, 0);
    }
 }
 INSTANTIATE_TEST_CASE_P(/**/, Async, testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)));
 #endif  // HAVE_INF_ENGINE
 }} // namespace