Merge pull request #16039 from dmatveev:dm/gapi_tutorial_interactive_face_detection
* G-API-NG/Docs: Added a tutorial page on interactive face detection sample
  - Introduced a "--ser" option to run the pipeline serially for benchmarking purposes
  - Reorganized sample code to better fit the documentation
  - Fixed a couple of issues (mainly typos) in the public headers
* G-API-NG/Docs: Reflected meta-less compilation in new G-API tutorial
* G-API-NG/Docs: Addressed review comments on Face Analytics Pipeline example
parent 3fddd3bf93
commit c89780dfe0
@@ -287,7 +287,7 @@ CALLER_GRAPH = NO
 GRAPHICAL_HIERARCHY = YES
 DIRECTORY_GRAPH = YES
 DOT_IMAGE_FORMAT = svg
-INTERACTIVE_SVG = YES
+INTERACTIVE_SVG = NO
 DOT_PATH =
 DOTFILE_DIRS =
 MSCFILE_DIRS =
@@ -0,0 +1,353 @@
# Face analytics pipeline with G-API {#tutorial_gapi_interactive_face_detection}

[TOC]

# Overview {#gapi_ifd_intro}

In this tutorial you will learn:
* How to integrate Deep Learning inference in a G-API graph;
* How to run a G-API graph on a video stream and obtain data from it.

# Prerequisites {#gapi_ifd_prereq}

This sample requires:
- PC with GNU/Linux or Microsoft Windows (Apple macOS is supported but
  was not tested);
- OpenCV 4.2 or later built with Intel® Distribution of
  [OpenVINO™ Toolkit](https://docs.openvinotoolkit.org/) (building with
  [Intel® TBB](https://www.threadingbuildingblocks.org/intel-tbb-tutorial)
  is a plus);
- The following topologies from OpenVINO™ Toolkit
  [Open Model Zoo](https://github.com/opencv/open_model_zoo):
  - `face-detection-adas-0001`;
  - `age-gender-recognition-retail-0013`;
  - `emotions-recognition-retail-0003`.
# Introduction: why G-API {#gapi_ifd_why}

Many computer vision algorithms run on a video stream rather than on
individual images. Stream processing usually consists of multiple
steps -- like decode, preprocessing, detection, tracking,
classification (on detected objects), and visualization -- forming a
*video processing pipeline*. Moreover, many of these steps can run in
parallel -- modern platforms have different hardware blocks on the
same chip, like decoders and GPUs, and extra accelerators can be
plugged in as extensions, like Intel® Movidius™ Neural Compute Stick
for deep learning offload.

Given all this variety of options and the diversity of video analytics
algorithms, managing such pipelines effectively quickly becomes a
problem. Of course it can be done manually, but this approach doesn't
scale: if a change is required in the algorithm (e.g. a new pipeline
step is added), or if it is ported to a new platform with different
capabilities, the whole pipeline needs to be re-optimized.

Starting with version 4.2, OpenCV offers a solution to this
problem. OpenCV G-API can now manage Deep Learning inference (a
cornerstone of any modern analytics pipeline) together with
traditional Computer Vision, as well as video capturing/decoding, all
in a single pipeline. G-API takes care of pipelining itself -- so if
the algorithm or platform changes, the execution model adapts to it
automatically.
# Pipeline overview {#gapi_ifd_overview}

Our sample application is based on the ["Interactive Face Detection"]
demo from OpenVINO™ Toolkit Open Model Zoo. A simplified pipeline
consists of the following steps:
1. Image acquisition and decode;
2. Detection with preprocessing;
3. Classification with preprocessing for every detected object with
   two networks;
4. Visualization.
\dot
digraph pipeline {
  node [shape=record fontname=Helvetica fontsize=10 style=filled color="#4c7aa4" fillcolor="#5b9bd5" fontcolor="white"];
  edge [color="#62a8e7"];
  splines=ortho;

  rankdir = LR;
  subgraph cluster_0 {
    color=invis;
    capture [label="Capture\nDecode"];
    resize [label="Resize\nConvert"];
    detect [label="Detect faces"];
    capture -> resize -> detect
  }

  subgraph cluster_1 {
    graph[style=dashed];

    subgraph cluster_2 {
      color=invis;
      temp_4 [style=invis shape=point width=0];
      postproc_1 [label="Crop\nResize\nConvert"];
      age_gender [label="Classify\nAge/gender"];
      postproc_1 -> age_gender [constraint=true]
      temp_4 -> postproc_1 [constraint=none]
    }

    subgraph cluster_3 {
      color=invis;
      postproc_2 [label="Crop\nResize\nConvert"];
      emo [label="Classify\nEmotions"];
      postproc_2 -> emo [constraint=true]
    }
    label="(for each face)";
  }

  temp_1 [style=invis shape=point width=0];
  temp_2 [style=invis shape=point width=0];
  detect -> temp_1 [arrowhead=none]
  temp_1 -> postproc_1

  capture -> {temp_4, temp_2} [arrowhead=none constraint=false]
  temp_2 -> postproc_2

  temp_1 -> temp_2 [arrowhead=none constraint=false]

  temp_3 [style=invis shape=point width=0];
  show [label="Visualize\nDisplay"];

  {age_gender, emo} -> temp_3 [arrowhead=none]
  temp_3 -> show
}
\enddot
# Constructing a pipeline {#gapi_ifd_constructing}

Constructing a G-API graph for a video streaming case does not differ
much from a [regular usage](@ref gapi_example) of G-API -- it is still
about defining graph *data* (with cv::GMat, cv::GScalar, and
cv::GArray) and *operations* over it. Inference also becomes an
operation in the graph, but is defined in a slightly different way.
## Declaring Deep Learning topologies {#gapi_ifd_declaring_nets}

In contrast with traditional CV functions (see [core] and [imgproc]),
where G-API declares a distinct operation for every function,
inference in G-API is a single generic operation cv::gapi::infer<>.
As usual, it is just an interface and it can be implemented in a
number of ways under the hood. In OpenCV 4.2, only the OpenVINO™
Inference Engine-based backend is available, with an OpenCV DNN
module-based backend to come.

cv::gapi::infer<> is _parametrized_ by the details of a topology we
are going to execute. Like operations, topologies in G-API are
strongly typed and are defined with a special macro G_API_NET():

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp G_API_NET

Similar to how operations are defined with G_API_OP(), a network
description requires three parameters (see the sketch after this
list):
1. A type name. Every defined topology is declared as a distinct C++
   type which is used further in the program -- see below;
2. A `std::function<>`-like API signature. G-API treats networks as
   regular "functions" which take and return data. Here network
   `Faces` (a detector) takes a cv::GMat and returns a cv::GMat, while
   network `AgeGender` is known to provide two outputs (age and gender
   blobs, respectively) -- so it has a `std::tuple<>` as its return
   type;
3. A topology name -- can be any non-empty string; G-API uses these
   names to distinguish networks internally. Names should be unique
   in the scope of a single graph.
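For illustration, here is a condensed version of the sample's network
declarations (the `AGInfo` tuple alias mirrors the two outputs of
`AgeGender`; the exact code is in the snippet referenced above):

```cpp
// Face detector: takes one Mat, returns another Mat.
G_API_NET(Faces, <cv::GMat(cv::GMat)>, "face-detector");

// Age/gender recognition: takes one Mat, returns two Mats
// (age and gender blobs, respectively).
using AGInfo = std::tuple<cv::GMat, cv::GMat>;
G_API_NET(AgeGender, <AGInfo(cv::GMat)>, "age-gender-recognition");

// Emotion recognition: takes one Mat, returns another.
G_API_NET(Emotions, <cv::GMat(cv::GMat)>, "emotions-recognition");
```
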
## Building a GComputation {#gapi_ifd_gcomputation}

Now the above pipeline is expressed in G-API like this:

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp GComputation

Every pipeline starts with declaring empty data objects -- which act
as inputs to the pipeline. Then we call a generic cv::gapi::infer<>
specialized to the `Faces` detection network. cv::gapi::infer<>
inherits its signature from its template parameter -- and in this case
it expects one input cv::GMat and produces one output cv::GMat.

In this sample we use a pre-trained SSD-based network and its output
needs to be parsed to an array of detections (object regions of
interest, ROIs). It is done by a custom operation `custom::PostProc`,
which returns an array of rectangles (of type `cv::GArray<cv::Rect>`)
back to the pipeline. This operation also filters out results by a
confidence threshold -- and these details are hidden in the kernel
itself. Still, at the moment of graph construction we operate with
interfaces only and don't need actual kernels to express the pipeline
-- so the implementation of this post-processing is listed later.

After the detection output is parsed to an array of objects, we can
run classification on any of those. G-API doesn't support syntax for
in-graph loops like `for_each()` yet, but instead cv::gapi::infer<>
comes with a special list-oriented overload.

A user can call cv::gapi::infer<> with a cv::GArray as the first
argument; then G-API assumes it needs to run the associated network on
every rectangle from the given list on the given frame (the second
argument). The result of such an operation is also a list -- a
cv::GArray of cv::GMat.

Since the `AgeGender` network itself produces two outputs, its output
type for a list-based version of cv::gapi::infer<> is a tuple of
arrays. We use `std::tie()` to decompose this result into two distinct
objects.

The `Emotions` network produces a single output, so its list-based
inference's return type is `cv::GArray<cv::GMat>` (see the sketch
below).
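A condensed sketch of how such a graph may be assembled, following the
description above (variable names here are illustrative;
`cv::gapi::copy()` is used to pass the input frame through to the
outputs):

```cpp
cv::GComputation pp([]() {
    // Empty GMat -- the beginning of the pipeline.
    cv::GMat in;

    // Run face detection and parse its SSD output into a list of ROIs.
    cv::GMat detections = cv::gapi::infer<custom::Faces>(in);
    cv::GArray<cv::Rect> faces = custom::PostProc::on(detections, in);

    // List-oriented inference: run both classifiers on every detected face.
    cv::GArray<cv::GMat> ages, genders;
    std::tie(ages, genders) = cv::gapi::infer<custom::AgeGender>(faces, in);
    cv::GArray<cv::GMat> emotions = cv::gapi::infer<custom::Emotions>(faces, in);

    // Pass the input frame through so it can be visualized later.
    cv::GMat frame = cv::gapi::copy(in);
    return cv::GComputation(cv::GIn(in),
                            cv::GOut(frame, faces, ages, genders, emotions));
});
```
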
# Configuring the pipeline {#gapi_ifd_configuration}

G-API strictly separates construction from configuration -- with the
idea to keep algorithm code itself platform-neutral. In the above
listings we only declared our operations and expressed the overall
data flow, but didn't even mention that we use OpenVINO™. We only
described *what* we do, but not *how* we do it. Keeping these two
aspects clearly separated is the design goal for G-API.

Platform-specific details arise when the pipeline is *compiled* --
i.e. is turned from a declarative into an executable form. The way
*how* to run things is specified via compilation arguments, and the
new inference/streaming features are no exception to this rule.

G-API is built on backends which implement interfaces (see
[Architecture] and [Kernels] for details) -- thus cv::gapi::infer<> is
a function which can be implemented by different backends. In OpenCV
4.2, only the OpenVINO™ Inference Engine backend for inference is
available. Every inference backend in G-API has to provide a special
parameterizable structure to express *backend-specific* neural network
parameters -- and in this case, it is cv::gapi::ie::Params:
@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Param_Cfg

Here we define three parameter objects: `det_net`, `age_net`, and
`emo_net`. Every object is a cv::gapi::ie::Params structure
parametrized for a particular network we use. At the compilation
stage, G-API automatically matches network parameters with their
cv::gapi::infer<> calls in the graph using this information.

Regardless of the topology, every parameter structure is constructed
with three string arguments -- specific to the OpenVINO™ Inference
Engine:
1. Path to the topology's intermediate representation (.xml file);
2. Path to the topology's model weights (.bin file);
3. Device where to run -- "CPU", "GPU", and others -- based on your
   OpenVINO™ Toolkit installation.

These arguments are taken from the command-line parser, as shown in
the sketch below.
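For instance, the parametrization of the `Emotions` network boils down
to this (a condensed form of the snippet above; option names follow
the sample's command-line keys):

```cpp
auto emo_net = cv::gapi::ie::Params<custom::Emotions> {
    cmd.get<std::string>("emom"),   // path to topology IR (.xml file)
    cmd.get<std::string>("emow"),   // path to model weights (.bin file)
    cmd.get<std::string>("emod"),   // device specifier ("CPU", "GPU", ...)
};
```
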
Once networks are defined and custom kernels are implemented, the
pipeline is compiled for streaming:

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Compile

cv::GComputation::compileStreaming() triggers a special video-oriented
form of graph compilation where G-API tries to optimize
throughput. The result of this compilation is an object of the special
type cv::GStreamingCompiled -- in contrast to a traditional callable
cv::GCompiled, these objects are closer to media players in their
semantics.

@note There is no need to pass metadata arguments describing the
format of the input video stream in
cv::GComputation::compileStreaming() -- G-API automatically figures
out the actual formats of the input data and adjusts the pipeline to
them on the fly. The user still can pass metadata there, as with a
regular cv::GComputation::compile(), in order to fix the pipeline to
a specific input format. A condensed version of this compilation step
follows below.
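Condensed, the compilation step of this sample looks as follows
(kernel and network packages bundle the custom post-processing
implementation and the three parametrized networks, respectively):

```cpp
// Kernel package: our OpenCV-based post-processing implementation.
auto kernels  = cv::gapi::kernels<custom::OCVPostProc>();
// Network package: the three parametrized networks.
auto networks = cv::gapi::networks(det_net, age_net, emo_net);

// Compile for streaming; input metadata may be omitted here.
auto cc = pp.compileStreaming(cv::compile_args(kernels, networks));
```
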
# Running the pipeline {#gapi_ifd_running}

Pipelining optimization is based on processing multiple input video
frames simultaneously, running different steps of the pipeline in
parallel. This is why it works best when the framework takes full
control over the video stream.

The idea behind the streaming API is that the user specifies an
*input source* for the pipeline, and then G-API manages its execution
automatically until the source ends or the user interrupts the
execution. G-API pulls new image data from the source and passes it
to the pipeline for processing.

Streaming sources are represented by the interface
cv::gapi::wip::IStreamSource. Objects implementing this interface may
be passed to `GStreamingCompiled` as regular inputs via the
`cv::gin()` helper function. In OpenCV 4.2, only one streaming source
is allowed per pipeline -- this requirement will be relaxed in the
future.

OpenCV comes with a great class, cv::VideoCapture, and by default
G-API ships with a stream source class based on it --
cv::gapi::wip::GCaptureSource. Users can implement their own streaming
sources, e.g. using [VAAPI] or other media or networking APIs.
The sample application specifies the input source as follows:

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Source

Please note that a GComputation may still have multiple inputs like
cv::GMat, cv::GScalar, or cv::GArray objects. The user can pass their
respective host-side types (cv::Mat, cv::Scalar, std::vector<>) in the
input vector as well, but in streaming mode these objects will create
"endless" constant streams. Mixing a real video source stream and a
const data stream is allowed; a condensed version of the source setup
is sketched below.
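In condensed form, the source setup amounts to wrapping a
cv::VideoCapture-backed source and passing it via cv::gin() (here
`input` is the path or camera index string from the command line):

```cpp
// Wrap cv::VideoCapture into a stream source and feed it to the
// compiled pipeline.
auto in_src = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input);
cc.setSource(cv::gin(in_src));
```
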
Running a pipeline is easy -- just call
cv::GStreamingCompiled::start() and fetch your data with the blocking
cv::GStreamingCompiled::pull() or the non-blocking
cv::GStreamingCompiled::try_pull(); repeat until the stream ends:

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Run

The above code may look complex, but in fact it handles two modes --
with and without a graphical user interface (GUI); a condensed version
follows after this list:
- When the sample is running in "headless" mode (the `--pure` option
  is set), this code simply pulls data from the pipeline with the
  blocking `pull()` until it ends. This is the most performant mode of
  execution.
- When results are also displayed on the screen, the window system
  needs some time to refresh the window contents and handle GUI
  events. In this case, the demo pulls data with the non-blocking
  `try_pull()` until there is no more data available (which does not
  mark the end of the stream -- it just means new data is not ready
  yet), and only then displays the latest obtained result and
  refreshes the screen. Reducing the time spent in the GUI with this
  trick slightly increases the overall performance.
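A condensed sketch of this dual-mode loop (output containers and the
`no_show` flag are as defined earlier in the sample; visualization
calls are elided):

```cpp
// Start the execution once the data source is specified.
cc.start();

while (cc.running()) {
    auto out_vector = cv::gout(frame, faces, out_ages, out_genders, out_emotions);
    if (no_show) {
        // Headless mode: blocking pull(); returns false when the stream ends.
        if (!cc.pull(std::move(out_vector)))
            break;
    } else if (!cc.try_pull(std::move(out_vector))) {
        // GUI mode: non-blocking try_pull(); no new data yet, so let
        // the UI refresh (and handle a possible keypress).
        if (cv::waitKey(1) >= 0) break;
        else continue;
    }
    // At this point data has been obtained, in either way.
    // ... visualize, count frames, display ...
}
```
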
# Comparison with serial mode {#gapi_ifd_comparison}

The sample can also run in a serial mode for reference and
benchmarking purposes. In this case, a regular
cv::GComputation::compile() is used and a regular single-frame
cv::GCompiled object is produced; the pipelining optimization is not
applied within G-API; it is the user's responsibility to acquire image
frames from a cv::VideoCapture object and pass those to G-API:

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Run_Serial

On a test machine (Intel® Core™ i5-6600), with OpenCV built with
[Intel® TBB] support, the detector network assigned to CPU, and the
classifiers to iGPU, the pipelined sample outperforms the serial one
by a factor of 1.36x (thus adding +36% to the overall throughput).
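For reference, the serial loop boils down to the following sketch
(compilation happens on the fly at the first apply() call, which is
why the sample starts its timer only after the first frame):

```cpp
cv::VideoCapture cap(input);
cv::Mat in_frame, frame;
std::vector<cv::Rect> faces;
std::vector<cv::Mat> out_ages, out_genders, out_emotions;

while (cap.read(in_frame)) {
    // Frames are acquired manually and pushed through the graph one
    // by one; no pipelining is involved.
    pp.apply(cv::gin(in_frame),
             cv::gout(frame, faces, out_ages, out_genders, out_emotions),
             cv::compile_args(kernels, networks));
    // ... visualize and display ...
}
```
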
# Conclusion {#gapi_ifd_conclusion}

G-API introduces a technological way to build and optimize hybrid
pipelines. Switching to a new execution model does not require changes
in the algorithm code expressed with G-API -- only the way the graph
is triggered differs.
# Listing: post-processing kernel {#gapi_ifd_pp}

G-API gives an easy way to plug custom code into the pipeline even if
it is running in a streaming mode and processing tensor
data. Inference results are represented by multi-dimensional cv::Mat
objects, so accessing those is as easy as with the regular DNN module.

The OpenCV-based SSD post-processing kernel is defined and implemented
in this sample as follows (a condensed sketch is given after the
snippet):

@snippet cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp Postproc
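The following is a condensed sketch of that kernel, assuming the
standard SSD output layout (a 1×1×N×7 float blob with [image_id,
label, confidence, left, top, right, bottom] per detection, with
normalized coordinates); helper names here are illustrative:

```cpp
GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
    static void run(const cv::Mat &in_ssd_result,
                    const cv::Mat &in_frame,
                    std::vector<cv::Rect> &out_faces) {
        out_faces.clear();
        const cv::Size in_size = in_frame.size();
        const float *data = in_ssd_result.ptr<float>();
        for (int i = 0; i < in_ssd_result.size[2]; i++) {
            const float image_id   = data[i*7 + 0];
            const float confidence = data[i*7 + 2];
            if (image_id < 0.f)    break;     // end of detections
            if (confidence < 0.5f) continue;  // confidence threshold
            // Scale normalized coordinates to absolute pixel
            // coordinates and clip by the source image boundaries.
            cv::Rect rc;
            rc.x      = static_cast<int>(data[i*7 + 3] * in_size.width);
            rc.y      = static_cast<int>(data[i*7 + 4] * in_size.height);
            rc.width  = static_cast<int>(data[i*7 + 5] * in_size.width)  - rc.x;
            rc.height = static_cast<int>(data[i*7 + 6] * in_size.height) - rc.y;
            out_faces.push_back(rc & cv::Rect(cv::Point{0,0}, in_size));
        }
    }
};
```
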
["Interactive Face Detection"]: https://github.com/opencv/open_model_zoo/tree/master/demos/interactive_face_detection_demo
|
||||||
|
[core]: @ref gapi_core
|
||||||
|
[imgproc]: @ref gapi_imgproc
|
||||||
|
[Architecture]: @ref gapi_hld
|
||||||
|
[Kernels]: @ref gapi_kernel_api
|
||||||
|
[VAAPI]: https://01.org/vaapi
|
@@ -3,6 +3,20 @@
 In this section you will learn about graph-based image processing and
 how G-API module can be used for that.
 
+- @subpage tutorial_gapi_interactive_face_detection
+
+    *Languages:* C++
+
+    *Compatibility:* \> OpenCV 4.2
+
+    *Author:* Dmitry Matveev
+
+    This tutorial illustrates how to build a hybrid video processing
+    pipeline with G-API where Deep Learning and image processing are
+    combined effectively to maximize the overall throughput. This
+    sample requires Intel® distribution of OpenVINO™ Toolkit version
+    2019R2 or later.
+
 - @subpage tutorial_gapi_anisotropic_segmentation
 
     *Languages:* C++
@@ -24,12 +24,13 @@ namespace wip {
  * Implement this interface if you want customize the way how data is
  * streaming into GStreamingCompiled.
  *
- * Objects implementing this interface can be passes to
- * GStreamingCompiled via setSource()/cv::gin(). Regular compiled
- * graphs (GCompiled) don't support input objects of this type.
+ * Objects implementing this interface can be passed to
+ * GStreamingCompiled using setSource() with cv::gin(). Regular
+ * compiled graphs (GCompiled) don't support input objects of this
+ * type.
  *
  * Default cv::VideoCapture-based implementation is available, see
- * cv::gapi::GCaptureSource.
+ * cv::gapi::wip::GCaptureSource.
  *
  * @note stream sources are passed to G-API via shared pointers, so
  * please use ptr() when passing a IStreamSource implementation to
@@ -30,7 +30,8 @@ const std::string keys =
     "{ emom | | IE emotions recognition model IR }"
     "{ emow | | IE emotions recognition model weights }"
     "{ emod | | IE emotions recognition model device }"
-    "{ pure | | When set, no output is displayed. Useful for benchmarking }";
+    "{ pure | | When set, no output is displayed. Useful for benchmarking }"
+    "{ ser  | | Run serially (no pipelining involved). Useful for benchmarking }";
 
 struct Avg {
     struct Elapsed {
@@ -73,6 +74,7 @@ namespace custom {
 // executed. The _how_ is defined at graph compilation stage (via parameters),
 // not on the graph construction stage.
 
+//! [G_API_NET]
 // Face detector: takes one Mat, returns another Mat
 G_API_NET(Faces, <cv::GMat(cv::GMat)>, "face-detector");
 
@@ -84,7 +86,9 @@ G_API_NET(AgeGender, <AGInfo(cv::GMat)>, "age-gender-recoginition");
 
 // Emotion recognition - takes one Mat, returns another.
 G_API_NET(Emotions, <cv::GMat(cv::GMat)>, "emotions-recognition");
+//! [G_API_NET]
 
+//! [Postproc]
 // SSD Post-processing function - this is not a network but a kernel.
 // The kernel body is declared separately, this is just an interface.
 // This operation takes two Mats (detections and the source image),
@@ -101,6 +105,7 @@ G_API_OP(PostProc, <cv::GArray<cv::Rect>(cv::GMat, cv::GMat)>, "custom.fd_postpr
     }
 };
 
+// OpenCV-based implementation of the above kernel.
 GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
     static void run(const cv::Mat &in_ssd_result,
                     const cv::Mat &in_frame,
@@ -124,10 +129,12 @@ GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
             if (image_id < 0.f) {  // indicates end of detections
                 break;
             }
-            if (confidence < 0.5f) { // fixme: hard-coded snapshot
+            if (confidence < 0.5f) { // a hard-coded snapshot
                 continue;
             }
 
+            // Convert floating-point coordinates to the absolute image
+            // frame coordinates; clip by the source image boundaries.
             cv::Rect rc;
             rc.x = static_cast<int>(rc_left * upscale.width);
             rc.y = static_cast<int>(rc_top  * upscale.height);
@@ -137,6 +144,8 @@ GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
         }
     }
 };
+//! [Postproc]
+
 } // namespace custom
 
 namespace labels {
@@ -208,9 +217,11 @@ int main(int argc, char *argv[])
     }
     const std::string input = cmd.get<std::string>("input");
     const bool no_show = cmd.get<bool>("pure");
+    const bool be_serial = cmd.get<bool>("ser");
 
     // Express our processing pipeline. Lambda-based constructor
     // is used to keep all temporary objects in a dedicated scope.
+    //! [GComputation]
     cv::GComputation pp([]() {
         // Declare an empty GMat - the beginning of the pipeline.
         cv::GMat in;
@@ -256,6 +267,7 @@ int main(int argc, char *argv[])
         return cv::GComputation(cv::GIn(in),
                                 cv::GOut(frame, faces, ages, genders, emotions));
     });
+    //! [GComputation]
 
     // Note: it might be very useful to have dimensions loaded at this point!
     // After our computation is defined, specify how it should be executed.
|
|||||||
//
|
//
|
||||||
// OpenCV DNN backend will have its own parmater structure with settings
|
// OpenCV DNN backend will have its own parmater structure with settings
|
||||||
// relevant to OpenCV DNN module. Same applies to other possible inference
|
// relevant to OpenCV DNN module. Same applies to other possible inference
|
||||||
// backends, like cuDNN, etc (:-))
|
// backends...
|
||||||
|
//! [Param_Cfg]
|
||||||
auto det_net = cv::gapi::ie::Params<custom::Faces> {
|
auto det_net = cv::gapi::ie::Params<custom::Faces> {
|
||||||
cmd.get<std::string>("fdm"), // read cmd args: path to topology IR
|
cmd.get<std::string>("fdm"), // read cmd args: path to topology IR
|
||||||
cmd.get<std::string>("fdw"), // read cmd args: path to weights
|
cmd.get<std::string>("fdw"), // read cmd args: path to weights
|
||||||
@ -287,57 +300,102 @@ int main(int argc, char *argv[])
|
|||||||
cmd.get<std::string>("emow"), // read cmd args: path to weights
|
cmd.get<std::string>("emow"), // read cmd args: path to weights
|
||||||
cmd.get<std::string>("emod"), // read cmd args: device specifier
|
cmd.get<std::string>("emod"), // read cmd args: device specifier
|
||||||
};
|
};
|
||||||
|
//! [Param_Cfg]
|
||||||
|
|
||||||
|
//! [Compile]
|
||||||
// Form a kernel package (with a single OpenCV-based implementation of our
|
// Form a kernel package (with a single OpenCV-based implementation of our
|
||||||
// post-processing) and a network package (holding our three networks).x
|
// post-processing) and a network package (holding our three networks).
|
||||||
auto kernels = cv::gapi::kernels<custom::OCVPostProc>();
|
auto kernels = cv::gapi::kernels<custom::OCVPostProc>();
|
||||||
auto networks = cv::gapi::networks(det_net, age_net, emo_net);
|
auto networks = cv::gapi::networks(det_net, age_net, emo_net);
|
||||||
|
|
||||||
// Compile our pipeline for a specific input image format (TBD - can be relaxed)
|
// Compile our pipeline and pass our kernels & networks as
|
||||||
// and pass our kernels & networks as parameters.
|
// parameters. This is the place where G-API learns which
|
||||||
// This is the place where G-API learns which networks & kernels we're actually
|
// networks & kernels we're actually operating with (the graph
|
||||||
// operating with (the graph description itself known nothing about that).
|
// description itself known nothing about that).
|
||||||
auto cc = pp.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size(1280,720)},
|
auto cc = pp.compileStreaming(cv::compile_args(kernels, networks));
|
||||||
cv::compile_args(kernels, networks));
|
//! [Compile]
|
||||||
|
|
||||||
std::cout << "Reading " << input << std::endl;
|
|
||||||
cc.setSource(cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input));
|
|
||||||
|
|
||||||
Avg avg;
|
Avg avg;
|
||||||
avg.start();
|
std::size_t frames = 0u; // Frame counter (not produced by the graph)
|
||||||
cc.start();
|
|
||||||
|
|
||||||
cv::Mat frame;
|
std::cout << "Reading " << input << std::endl;
|
||||||
std::vector<cv::Rect> faces;
|
// Duplicate huge portions of the code in if/else branches in the sake of
|
||||||
std::vector<cv::Mat> out_ages;
|
// better documentation snippets
|
||||||
std::vector<cv::Mat> out_genders;
|
if (!be_serial) {
|
||||||
std::vector<cv::Mat> out_emotions;
|
//! [Source]
|
||||||
std::size_t frames = 0u;
|
auto in_src = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input);
|
||||||
|
cc.setSource(cv::gin(in_src));
|
||||||
|
//! [Source]
|
||||||
|
|
||||||
// Implement different execution policies depending on the display option
|
avg.start();
|
||||||
// for the best performance.
|
|
||||||
while (cc.running()) {
|
//! [Run]
|
||||||
auto out_vector = cv::gout(frame, faces, out_ages, out_genders, out_emotions);
|
// After data source is specified, start the execution
|
||||||
if (no_show) {
|
cc.start();
|
||||||
// This is purely a video processing. No need to balance with UI rendering.
|
|
||||||
// Use a blocking pull() to obtain data. Break the loop if the stream is over.
|
// Declare data objects we will be receiving from the pipeline.
|
||||||
if (!cc.pull(std::move(out_vector)))
|
cv::Mat frame; // The captured frame itself
|
||||||
break;
|
std::vector<cv::Rect> faces; // Array of detected faces
|
||||||
} else if (!cc.try_pull(std::move(out_vector))) {
|
std::vector<cv::Mat> out_ages; // Array of inferred ages (one blob per face)
|
||||||
// Use a non-blocking try_pull() to obtain data.
|
std::vector<cv::Mat> out_genders; // Array of inferred genders (one blob per face)
|
||||||
// If there's no data, let UI refresh (and handle keypress)
|
std::vector<cv::Mat> out_emotions; // Array of classified emotions (one blob per face)
|
||||||
if (cv::waitKey(1) >= 0) break;
|
|
||||||
else continue;
|
// Implement different execution policies depending on the display option
|
||||||
|
// for the best performance.
|
||||||
|
while (cc.running()) {
|
||||||
|
auto out_vector = cv::gout(frame, faces, out_ages, out_genders, out_emotions);
|
||||||
|
if (no_show) {
|
||||||
|
// This is purely a video processing. No need to balance
|
||||||
|
// with UI rendering. Use a blocking pull() to obtain
|
||||||
|
// data. Break the loop if the stream is over.
|
||||||
|
if (!cc.pull(std::move(out_vector)))
|
||||||
|
break;
|
||||||
|
} else if (!cc.try_pull(std::move(out_vector))) {
|
||||||
|
// Use a non-blocking try_pull() to obtain data.
|
||||||
|
// If there's no data, let UI refresh (and handle keypress)
|
||||||
|
if (cv::waitKey(1) >= 0) break;
|
||||||
|
else continue;
|
||||||
|
}
|
||||||
|
// At this point we have data for sure (obtained in either
|
||||||
|
// blocking or non-blocking way).
|
||||||
|
frames++;
|
||||||
|
labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions);
|
||||||
|
labels::DrawFPS(frame, frames, avg.fps(frames));
|
||||||
|
if (!no_show) cv::imshow("Out", frame);
|
||||||
}
|
}
|
||||||
// At this point we have data for sure (obtained in either blocking or non-blocking way).
|
//! [Run]
|
||||||
frames++;
|
} else { // (serial flag)
|
||||||
labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions);
|
//! [Run_Serial]
|
||||||
labels::DrawFPS(frame, frames, avg.fps(frames));
|
cv::VideoCapture cap(input);
|
||||||
if (!no_show) cv::imshow("Out", frame);
|
cv::Mat in_frame, frame; // The captured frame itself
|
||||||
}
|
std::vector<cv::Rect> faces; // Array of detected faces
|
||||||
cc.stop();
|
std::vector<cv::Mat> out_ages; // Array of inferred ages (one blob per face)
|
||||||
std::cout << "Processed " << frames << " frames in " << avg.elapsed() << std::endl;
|
std::vector<cv::Mat> out_genders; // Array of inferred genders (one blob per face)
|
||||||
|
std::vector<cv::Mat> out_emotions; // Array of classified emotions (one blob per face)
|
||||||
|
|
||||||
|
while (cap.read(in_frame)) {
|
||||||
|
pp.apply(cv::gin(in_frame),
|
||||||
|
cv::gout(frame, faces, out_ages, out_genders, out_emotions),
|
||||||
|
cv::compile_args(kernels, networks));
|
||||||
|
labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions);
|
||||||
|
frames++;
|
||||||
|
if (frames == 1u) {
|
||||||
|
// Start timer only after 1st frame processed -- compilation
|
||||||
|
// happens on-the-fly here
|
||||||
|
avg.start();
|
||||||
|
} else {
|
||||||
|
// Measurfe & draw FPS for all other frames
|
||||||
|
labels::DrawFPS(frame, frames, avg.fps(frames-1));
|
||||||
|
}
|
||||||
|
if (!no_show) {
|
||||||
|
cv::imshow("Out", frame);
|
||||||
|
if (cv::waitKey(1) >= 0) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//! [Run_Serial]
|
||||||
|
}
|
||||||
|
std::cout << "Processed " << frames << " frames in " << avg.elapsed()
|
||||||
|
<< " (" << avg.fps(frames) << " FPS)" << std::endl;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|