From d8b1fc45aae30480ea033e5f9393ebe40ca65f99 Mon Sep 17 00:00:00 2001 From: Sinitsina Maria <49319156+SinM9@users.noreply.github.com> Date: Tue, 14 Dec 2021 20:33:26 +0300 Subject: [PATCH] Merge pull request #20934 from SinM9:spectrogram_samples AudioIO: add spectrogram samples for C++/python --- samples/cpp/audio_spectrogram.cpp | 1071 +++++++++++++++++++++++++++ samples/python/audio_spectrogram.py | 804 ++++++++++++++++++++ 2 files changed, 1875 insertions(+) create mode 100644 samples/cpp/audio_spectrogram.cpp create mode 100644 samples/python/audio_spectrogram.py diff --git a/samples/cpp/audio_spectrogram.cpp b/samples/cpp/audio_spectrogram.cpp new file mode 100644 index 0000000000..80bfc1fbde --- /dev/null +++ b/samples/cpp/audio_spectrogram.cpp @@ -0,0 +1,1071 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +using namespace cv; +using namespace std; + + +class AudioDrawing +{ + +public: + + AudioDrawing(const CommandLineParser& parser) { + if (!initAndCheckArgs(parser)) + { + cerr << "Error: Wrong input arguments" << endl; + exit(0); + } + Draw(); + } + + void Draw() { + if (draw == "static") + { + vectorinputAudio = {}; + int samplingRate = 0; + if (inputType == "file") + { + samplingRate = readAudioFile(audio, inputAudio); + } + else if (inputType == "microphone") + { + samplingRate = readAudioMicrophone(inputAudio); + } + if ((inputAudio.size() == 0) || samplingRate <= 0) + { + cerr << "Error: problems with audio reading, check input arguments" << endl; + return; + } + + int duration = static_cast(inputAudio.size()) / samplingRate; + + // since the dimensional grid is counted in integer seconds, + // if the input audio has an incomplete last second, + // then it is filled with zeros to complete + int remainder = static_cast(inputAudio.size()) % samplingRate; + if (remainder) + { + int sizeToFullSec = samplingRate - remainder; + for (int j = 0; j < sizeToFullSec; ++j) + { + inputAudio.push_back(0); + } + duration += 1; + cout << "Update duration of audio to full last second with " << + sizeToFullSec << " zero samples" << endl; + cout << "New number of samples " << inputAudio.size() << endl; + } + cout << "Duration of audio = " << duration << " seconds" << endl; + + // since the dimensional grid is counted in integer seconds, + // if duration of file is less than xmarkup, to avoid an incorrect display, + // xmarkup will be taken equal to duration + if (duration <= xmarkup) + { + xmarkup = duration + 1; + } + + if (graph == "ampl") + { + Mat imgAmplitude = drawAmplitude(inputAudio); + imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate); + imshow("Display amplitude graph", imgAmplitude); + waitKey(0); + } + else if (graph == "spec") + { + vector>stft = STFT(inputAudio); + Mat imgSpec = drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft); + imshow("Display spectrogram", imgSpec); + waitKey(0); + } + else if (graph == "ampl_and_spec") + { + Mat imgAmplitude = drawAmplitude(inputAudio); + imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate); + vector>stft = STFT(inputAudio); + Mat imgSpec = drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft); + Mat imgTotal = concatenateImages(imgAmplitude, imgSpec); + imshow("Display amplitude graph and spectrogram", imgTotal); + waitKey(0); + } + } + else if (draw == "dynamic") + { + if (inputType == "file") + { + dynamicFile(audio); + } + else if (inputType == "microphone") + { 
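+            // dynamicMicrophone() (defined below) mirrors dynamicFile(): it records
+            // for microTime seconds and redraws the chosen graph over a sliding
+            // window of frameSizeTime seconds that advances by updateTime seconds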
+ dynamicMicrophone(); + } + } + } + + ~AudioDrawing() { + } + + int readAudioFile(string file, vector& inputAudio) + { + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, audioStream, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_ANY, params); + if (!cap.isOpened()) + { + cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; + return -1; + } + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + vector frameVec; + Mat frame; + for (;;) + { + if (cap.grab()) + { + cap.retrieve(frame, audioBaseIndex); + frameVec = frame; + inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); + } + else + { + cout << "Number of samples: " << inputAudio.size() << endl; + break; + } + } + return samplingRate; + } + + int readAudioMicrophone(vector& inputAudio) + { + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1 }; + + cap.open(0, CAP_ANY, params); + if (!cap.isOpened()) + { + cerr << "Error: Can't open microphone" << endl; + return -1; + } + + const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; + int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + const double cvTickFreq = getTickFrequency(); + int64 sysTimeCurr = getTickCount(); + int64 sysTimePrev = sysTimeCurr; + + vector frameVec; + Mat frame; + while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) + { + if (cap.grab()) + { + cap.retrieve(frame, audioBaseIndex); + frameVec = frame; + inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); + sysTimeCurr = getTickCount(); + } + else + { + cerr << "Error: Grab error" << endl; + break; + } + } + cout << "Number of samples: " << inputAudio.size() << endl; + return samplingRate; + } + + + Mat drawAmplitude(vector& inputAudio) + { + Scalar color = Scalar(247,111,87); + int thickness = 5; + int frameVectorRows = 500; + int middle = frameVectorRows / 2; + // usually the input data is too big, so it is necessary + // to reduce size using interpolation of data + int frameVectorCols = 40000; + if (static_cast(inputAudio.size()) < frameVectorCols) + { + frameVectorCols = static_cast(inputAudio.size()); + } + + Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background + + vectorreshapeAudio(inputAudio.size()); + for (size_t i = 0; i < inputAudio.size(); ++i) + { + reshapeAudio[i]=static_cast(inputAudio[i]); + } + + Mat img_frameVector( 1, static_cast(reshapeAudio.size()), CV_64F , 
reshapeAudio.data()); + Mat img_frameVector_resize; + resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR); + reshapeAudio = img_frameVector_resize; + + // normalization data by maximum element + normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF); + + for (size_t i = 0; i < reshapeAudio.size(); ++i) + { + reshapeAudio[i] = middle - reshapeAudio[i] * middle; + } + + for (int i = 1; i < static_cast(reshapeAudio.size()); ++i) + { + line(img, Point(i-1, static_cast(reshapeAudio[i-1])), Point(i, static_cast(reshapeAudio[i])), color, thickness); + } + Mat resImage; + resize(img, resImage, Size(900, 400), INTER_AREA ); + return resImage; + } + + Mat drawAmplitudeScale(Mat& inputImg, const vector& inputAudio, int samplingRate, + int xmin = 0, int xmax = 0) + { + // function of layout drawing for graph of volume amplitudes + // x axis for time + // y axis for amplitudes + + // parameters for the new image size + int preCol = 100; + int aftCol = 100; + int preLine = 40; + int aftLine = 50; + + int frameVectorRows = inputImg.rows; + int frameVectorCols = inputImg.cols; + + int totalRows = preLine + frameVectorRows + aftLine; + int totalCols = preCol + frameVectorCols + aftCol; + + Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255)); + inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows))); + + + // calculating values on x axis + if (xmax == 0) + { + xmax = static_cast(inputAudio.size()) / samplingRate; + } + std::vector xList(xmarkup); + if (xmax >= xmarkup) + { + double deltax = (xmax - xmin) / (xmarkup - 1); + for (int i = 0; i < xmarkup; ++i) + { + xList[i] = (xmin + deltax * i); + } + } + else + { + // this case is used to display a dynamic update + vector tmpXList; + for (int i = xmin; i < xmax; ++i) + { + tmpXList.push_back(i + 1); + } + int k = 0; + for (int i = xmarkup - static_cast(tmpXList.size()); i < xmarkup; ++i) + { + xList[i] = tmpXList[k]; + k += 1; + } + } + + // calculating values on y axis + double minCv; double maxCv; Point minLoc; Point maxLoc; + minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc); + int ymin = static_cast(minCv); + int ymax = static_cast(maxCv); + + std::vector yList(ymarkup); + double deltay = (ymax - ymin) / (ymarkup - 1); + for (int i = 0; i < ymarkup; ++i) + { + yList[i] = ymin + deltay * i; + } + + // parameters for layout drawing + int textThickness = 1; + int gridThickness = 1; + Scalar gridColor(0, 0, 0); + Scalar textColor(0, 0, 0); + float fontScale = 0.5; + + // horizontal axis + line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), + gridColor, gridThickness); + // vertical axis + line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), + gridColor, gridThickness); + + // parameters for layout calculation + int serifSize = 10; + int indentDownX = serifSize * 2; + int indentDownY = serifSize / 2; + int indentLeftX = serifSize; + int indentLeftY = 2 * preCol / 3; + + + // drawing layout for x axis + int numX = frameVectorCols / (xmarkup - 1); + for (size_t i = 0; i < xList.size(); ++i) + { + int a1 = static_cast(preCol + i * numX); + int a2 = frameVectorRows + preLine; + + int b1 = a1; + int b2 = a2 + serifSize; + + if (enableGrid) + { + int d1 = a1; + int d2 = preLine; + line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); + } + line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); + putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, 
b2 + indentDownX), + FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); + } + + // drawing layout for y axis + int numY = frameVectorRows / (ymarkup - 1); + for (size_t i = 0; i < yList.size(); ++i) { + int a1 = preCol; + int a2 = static_cast(totalRows - aftLine - i * numY); + int b1 = preCol - serifSize; + int b2 = a2; + if (enableGrid) + { + int d1 = preCol + frameVectorCols; + int d2 = a2; + line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); + } + line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); + putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), + FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); + } + Mat resImage; + resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); + return resImage; + } + + vector> STFT(const vector& inputAudio) + { + // The Short-time Fourier transform (STFT), is a Fourier-related transform used to + // determine the sinusoidal frequency and phase content of local sections of a signal + // as it changes over time. + // In practice, the procedure for computing STFTs is to divide a longer time signal + // into shorter segments of equal length and then compute the Fourier transform separately + // on each shorter segment. This reveals the Fourier spectrum on each shorter segment. + // One then usually plots the changing spectra as a function of time, known as a spectrogram + // or waterfall plot. + // https://en.wikipedia.org/wiki/Short-time_Fourier_transform + + int timeStep = windLen - overlap; + Mat dstMat; + vector stftRow; + vector WindType; + if (windowType == "Hann") + { + // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows + for (int j = 1 - windLen; j < windLen; j+=2) + { + WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1))))); + } + } + else if (windowType == "Hamming") + { + // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows + for (int j = 1 - windLen; j < windLen; j+=2) + { + WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1))))); + } + } + for (size_t i = 0; i < inputAudio.size(); i += timeStep) + { + vectorsection(windLen, 0); + for (int j = 0; j < windLen; ++j) + { + section[j] = inputAudio[j + i]; + } + if (windowType == "Hann" || windowType == "Hamming") + { + for (size_t j = 0; j < section.size(); ++j) + { + section[j] *= WindType[j]; + } + } + + dft(section, dstMat, DFT_COMPLEX_OUTPUT); + + for (int j = 0; j < dstMat.cols / 4; ++j) + { + double complModule = sqrt(dstMat.at(2*j) * dstMat.at(2*j) + + dstMat.at(2*j+1) * dstMat.at(2*j+1)); + stftRow.push_back(complModule); + } + } + + size_t xSize = inputAudio.size() / timeStep + 1; + // we need only the first part of the spectrum, the second part is symmetrical + size_t ySize = dstMat.cols / 4; + + vector> stft(ySize, vector(xSize, 0.)); + for (size_t i = 0; i < xSize; ++i) + { + for (size_t j = 0; j < ySize; ++j) + { + // write elements with transposition and convert it to the decibel scale + double stftElem = stftRow[ i * ySize + j]; + if (stftElem != 0.) 
+ { + stft[j][i] = 10 * log10(stftElem); + } + } + } + return stft; + } + + Mat drawSpectrogram(const vector>& stft) + { + int frameVectorRows = static_cast(stft.size()); + int frameVectorCols = static_cast(stft[0].size()); + + // Normalization of image values from 0 to 255 to get more contrast image + // and this normalization will be taken into account in the scale drawing + int colormapImageRows = 255; + + double minCv; double maxCv; Point minLoc; Point maxLoc; + minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc); + double maxStft = max(abs(maxCv), abs(minCv)); + + for (int i = 1; i < frameVectorRows; ++i) + { + minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); + maxStft = max(maxStft, max(abs(maxCv), abs(minCv))); + } + // if maxStft is zero (silence) + if (maxStft == 0.) + { + maxStft = 1; + } + Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255)); + + for (int i = 0; i < frameVectorRows; ++i) + { + for (int j = 0; j < frameVectorCols; ++j) + { + imgSpec.at(frameVectorRows - i - 1, j) = static_cast(stft[i][j] * colormapImageRows / maxStft); + } + } + applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO); + Mat resImage; + resize(imgSpec, resImage, Size(900, 400), INTER_AREA); + return resImage; + } + + Mat drawSpectrogramColorbar(Mat& inputImg, const vector& inputAudio, + int samplingRate, const vector>& stft, + int xmin = 0, int xmax = 0) + { + // function of layout drawing for the three-dimensional graph of the spectrogram + // x axis for time + // y axis for frequencies + // z axis for magnitudes of frequencies shown by color scale + + // parameters for the new image size + int preCol = 100; + int aftCol = 100; + int preLine = 40; + int aftLine = 50; + int colColor = 20; + int indCol = 20; + + int frameVectorRows = inputImg.rows; + int frameVectorCols = inputImg.cols; + + int totalRows = preLine + frameVectorRows + aftLine; + int totalCols = preCol + frameVectorCols + aftCol; + + Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); + inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows))); + + // colorbar image due to drawSpectrogram(..) 
picture has been normalised from 255 to 0, + // so here colorbar has values from 255 to 0 + int colorArrSize = 256; + Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255)); + for (int i = 0; i < colorArrSize; ++i) + { + for( int j = 0; j < colColor; ++j) + { + imgColorBar.at(i, j) = static_cast(colorArrSize - 1 - i); // from 255 to 0 + } + } + + applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO); + resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA); + imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows))); + + + // calculating values on x axis + if (xmax == 0) + { + xmax = static_cast(inputAudio.size()) / samplingRate + 1; + } + vector xList(xmarkup, 0); + if (xmax >= xmarkup) + { + double deltax = (xmax - xmin) / (xmarkup - 1); + for(int i = 0; i < xmarkup; ++i) + { + xList[i] = xmin + deltax * i; + } + } + else + { + // this case is used to display a dynamic update + vector tmpXList; + for(int i = xmin; i < xmax; ++i) + { + tmpXList.push_back(i + 1); + } + int k = 0; + for (int i = xmarkup - static_cast(tmpXList.size()); i < xmarkup; ++i) + { + xList[i] = tmpXList[k]; + k += 1; + } + } + + // calculating values on y axis + // according to the Nyquist sampling theorem, + // signal should posses frequencies equal to half of sampling rate + int ymin = 0; + int ymax = static_cast(samplingRate / 2); + + vector yList; + double deltay = (ymax - ymin) / (ymarkup - 1); + for(int i = 0; i < ymarkup; ++i) + { + yList.push_back(ymin + deltay * i); + } + + // calculating values on z axis + double minCv; double maxCv; Point minLoc; Point maxLoc; + minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc); + double zmin = minCv, zmax = maxCv; + + std::vector zList; + for (size_t i = 1; i < stft.size(); ++i) + { + minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); + zmax = max(zmax, maxCv); + zmin = min(zmin, minCv); + } + double deltaz = (zmax - zmin) / (zmarkup - 1); + for(int i = 0; i < zmarkup; ++i) + { + zList.push_back(zmin + deltaz * i); + } + + // parameters for layout drawing + int textThickness = 1; + int gridThickness = 1; + Scalar gridColor(0,0,0); + Scalar textColor(0,0,0); + float fontScale = 0.5; + + int serifSize = 10; + int indentDownX = serifSize * 2; + int indentDownY = serifSize / 2; + int indentLeftX = serifSize; + int indentLeftY = 2 * preCol / 3; + + // horizontal axis + line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), + gridColor, gridThickness); + // vertical axis + line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), + gridColor, gridThickness); + + // drawing layout for x axis + int numX = frameVectorCols / (xmarkup - 1); + for (size_t i = 0; i < xList.size(); ++i) + { + int a1 = static_cast(preCol + i * numX); + int a2 = frameVectorRows + preLine; + + int b1 = a1; + int b2 = a2 + serifSize; + + line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); + putText(imgTotal, to_string(static_cast(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX), + FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); + } + + // drawing layout for y axis + int numY = frameVectorRows / (ymarkup - 1); + for (size_t i = 0; i < yList.size(); ++i) + { + int a1 = preCol; + int a2 = static_cast(totalRows - aftLine - i * numY); + + int b1 = preCol - serifSize; + int b2 = a2; + + line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); + putText(imgTotal, 
to_string(static_cast(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), + FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); + } + + // drawing layout for z axis + int numZ = frameVectorRows / (zmarkup - 1); + for (size_t i = 0; i < zList.size(); ++i) + { + int a1 = preCol + frameVectorCols + indCol + colColor; + int a2 = static_cast(totalRows - aftLine - i * numZ); + + int b1 = a1 + serifSize; + int b2 = a2; + + line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); + putText(imgTotal, to_string(static_cast(zList[i])), Point(b1 + 10, b2 + indentDownY), + FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); + } + Mat resImage; + resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); + return resImage; + } + + Mat concatenateImages(Mat& img1, Mat& img2) + { + // first image will be under the second image + int totalRows = img1.rows + img2.rows; + int totalCols = max(img1.cols , img2.cols); + // if images columns do not match, the difference is filled in white + Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); + + img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows))); + img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows))); + return imgTotal; + } + + void dynamicFile(const string file) + { + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, audioStream, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_ANY, params); + if (!cap.isOpened()) + { + cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; + return; + } + + const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); + int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); + + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + int step = static_cast(updateTime * samplingRate); + int frameSize = static_cast(frameSizeTime * samplingRate); + + // since the dimensional grid is counted in integer seconds, + // if duration of audio frame is less than xmarkup, to avoid an incorrect display, + // xmarkup will be taken equal to duration + if (frameSizeTime <= xmarkup) + { + xmarkup = frameSizeTime; + } + + vector buffer; + vector frameVector; + vector section(frameSize, 0); + vector>stft; + Mat frame, imgAmplitude, imgSpec, imgTotal; + int currentSamples = 0; + int xmin = 0; + int xmax = 0; + + for (;;) + { + if (cap.grab()) + { + cap.retrieve(frame, audioBaseIndex); + frameVector = frame; + buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); + int bufferSize = static_cast(buffer.size()); + if (bufferSize >= step) + { + currentSamples += bufferSize; + section.erase(section.begin(), section.begin() + step); + section.insert(section.end(), buffer.begin(), buffer.end()); + buffer.erase(buffer.begin(), buffer.begin() + step); + if (currentSamples < frameSize) + { + xmin = 0; + xmax = (currentSamples) / samplingRate; + } + else + { + xmin = (currentSamples - frameSize) / samplingRate + 1; + xmax = (currentSamples) / samplingRate; + } + + if (graph == "ampl") + { + imgAmplitude = drawAmplitude(section); + 
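+                        // drawAmplitude() renders the waveform itself; drawAmplitudeScale()
+                        // then overlays the time/amplitude axes, with xmin and xmax shifting
+                        // the time labels so they track the sliding window through the file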
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); + imshow("Display amplitude graph", imgAmplitude); + waitKey(waitTime); + } + else if (graph == "spec") + { + stft = STFT(section); + imgSpec = drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); + imshow("Display spectrogram", imgSpec); + waitKey(waitTime); + } + else if (graph == "ampl_and_spec") + { + imgAmplitude = drawAmplitude(section); + imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); + stft = STFT(section); + imgSpec = drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); + imgTotal = concatenateImages(imgAmplitude, imgSpec); + imshow("Display amplitude graph and spectrogram", imgTotal); + waitKey(waitTime); + } + } + } + else + { + break; + } + } + + } + + void dynamicMicrophone() + { + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1 }; + + cap.open(0, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "Error: Can't open microphone" << endl; + return; + } + + const int audioBaseIndex = static_cast(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = static_cast(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); + int samplingRate = static_cast(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + const double cvTickFreq = getTickFrequency(); + int64 sysTimeCurr = getTickCount(); + int64 sysTimePrev = sysTimeCurr; + + int step = (updateTime * samplingRate); + int frameSize = (frameSizeTime * samplingRate); + // since the dimensional grid is counted in integer seconds, + // if duration of audio frame is less than xmarkup, to avoid an incorrect display, + // xmarkup will be taken equal to duration + if (frameSizeTime <= xmarkup) + { + xmarkup = frameSizeTime; + } + + vector frameVector; + vector buffer; + vector section(frameSize, 0); + Mat frame, imgAmplitude, imgSpec, imgTotal; + + int currentSamples = 0; + vector> stft; + int xmin = 0; + int xmax = 0; + waitTime = updateTime * 1000; + while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) + { + if (cap.grab()) + { + cap.retrieve(frame, audioBaseIndex); + frameVector = frame; + buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); + sysTimeCurr = getTickCount(); + + int bufferSize = static_cast(buffer.size()); + if (bufferSize >= step) + { + currentSamples += step; + section.erase(section.begin(), section.begin() + step); + section.insert(section.end(), buffer.begin(), buffer.end()); + buffer.erase(buffer.begin(), buffer.begin() + step); + + if (currentSamples < frameSize) + { + xmin = 0; + xmax = (currentSamples) / samplingRate; + } + else + { + xmin = (currentSamples - frameSize) / samplingRate + 1; + xmax = (currentSamples) / samplingRate; + } + + if (graph == "ampl") + { + imgAmplitude = drawAmplitude(section); + imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); + imshow("Display amplitude graph", imgAmplitude); + waitKey(waitTime); + } + else if (graph == "spec") + { + stft = STFT(section); + imgSpec = 
drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); + imshow("Display spectrogram", imgSpec); + waitKey(waitTime); + } + else if (graph == "ampl_and_spec") + { + imgAmplitude = drawAmplitude(section); + imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); + stft = STFT(section); + imgSpec = drawSpectrogram(stft); + imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); + imgTotal = concatenateImages(imgAmplitude, imgSpec); + imshow("Display amplitude graph and spectrogram", imgTotal); + waitKey(waitTime); + } + } + } + else + { + cerr << "Error: Grab error" << endl; + break; + } + } + + } + + bool initAndCheckArgs(const CommandLineParser& parser) + { + inputType = parser.get("inputType"); + if ((inputType != "file") && (inputType != "microphone")) + { + cout << "Error: " << inputType << " input method doesnt exist" << endl; + return false; + } + + draw = parser.get("draw"); + if ((draw != "static") && (draw != "dynamic")) + { + cout << "Error: " << draw << " draw type doesnt exist" << endl; + return false; + } + + graph = parser.get("graph"); + if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec")) + { + cout << "Error: " << graph << " type of graph doesnt exist" << endl; + return false; + } + + audio = samples::findFile(parser.get("audio")); + + audioStream = parser.get("audioStream"); + if (audioStream < 0) + { + cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl; + return false; + } + windowType = parser.get("windowType"); + if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming")) + { + cout << "Error: " << windowType << " type of window doesnt exist" << endl; + return false; + } + + windLen = parser.get("windLen"); + if (windLen <= 0) + { + cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl; + return false; + } + + overlap = parser.get("overlap"); + if (overlap <= 0) + { + cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl; + return false; + } + + enableGrid = parser.get("enableGrid"); + + rows = parser.get("rows"); + if (rows <= 0) + { + cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl; + return false; + } + cols = parser.get("cols"); + + if (cols <= 0) + { + cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl; + return false; + } + xmarkup = parser.get("xmarkup"); + if (xmarkup < 2) + { + cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl; + return false; + } + ymarkup = parser.get("ymarkup"); + if (ymarkup < 2) + { + cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl; + return false; + } + zmarkup = parser.get("zmarkup"); + if (zmarkup < 2) + { + cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl; + return false; + } + microTime = parser.get("microTime"); + if (microTime <= 0) + { + cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl; + return false; + } + frameSizeTime = parser.get("frameSizeTime"); + if (frameSizeTime <= 0) + { + cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl; + return false; + } + updateTime = parser.get("updateTime"); + if (updateTime <= 0) + { + cout << "Error: updateTime = " << updateTime << " - incorrect value. 
Must be > 0" << endl; + return false; + } + waitTime = parser.get("waitTime"); + if (waitTime < 0) + { + cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl; + return false; + } + return true; + } + +private : + string inputType; + string draw; + string graph; + string audio; + int audioStream; + + string windowType; + int windLen; + int overlap; + + bool enableGrid; + + int rows; + int cols; + + int xmarkup; + int ymarkup; + int zmarkup; + + int microTime; + int frameSizeTime; + int updateTime; + int waitTime; + +}; + +int main(int argc, char** argv) +{ + const String keys = + "{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}" + "{inputType i | file | file or microphone }" + "{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}" + "{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}" + "{audio a | Megamind.avi | name and path to file }" + "{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }" + "{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }" + "{windLen l | 256 | size of window for STFT }" + "{overlap o | 128 | overlap of windows for STFT }" + + "{enableGrid | false | grid on the amplitude graph }" + + "{rows r | 400 | rows of output image }" + "{cols c | 900 | cols of output image }" + + "{xmarkup x | 5 | number of x axis divisions (time asix) }" + "{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }" + "{zmarkup z | 5 | number of z axis divisions (colorbar) }" + + "{microTime m | 20 | time of recording audio with microphone in seconds }" + "{frameSizeTime f| 5 | size of sliding window in seconds }" + "{updateTime u | 1 | update time of sliding window in seconds }" + "{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }" + ; + + CommandLineParser parser(argc, argv, keys); + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } + + AudioDrawing draw(parser); + return 0; +} \ No newline at end of file diff --git a/samples/python/audio_spectrogram.py b/samples/python/audio_spectrogram.py new file mode 100644 index 0000000000..f211bbb584 --- /dev/null +++ b/samples/python/audio_spectrogram.py @@ -0,0 +1,804 @@ +import numpy as np +import cv2 as cv +import math +import argparse + +class AudioDrawing: + ''' + Used for drawing audio graphics + ''' + def __init__(self, args): + + self.inputType = args.inputType + self.draw = args.draw + self.graph = args.graph + self.audio = cv.samples.findFile(args.audio) + self.audioStream = args.audioStream + + self.windowType = args.windowType + self.windLen = args.windLen + self.overlap = args.overlap + + self.enableGrid = args.enableGrid + + self.rows = args.rows + self.cols = args.cols + + self.xmarkup = args.xmarkup + self.ymarkup = args.ymarkup + self.zmarkup = args.zmarkup + + self.microTime = args.microTime + self.frameSizeTime = args.frameSizeTime + self.updateTime = args.updateTime + self.waitTime = args.waitTime + + if self.initAndCheckArgs(args) is False: + exit() + + + def Draw(self): + if self.draw == 
"static": + + if self.inputType == "file": + samplingRate, inputAudio = self.readAudioFile(self.audio) + + elif self.inputType == "microphone": + samplingRate, inputAudio = self.readAudioMicrophone() + + duration = len(inputAudio) // samplingRate + + # since the dimensional grid is counted in integer seconds, + # if the input audio has an incomplete last second, + # then it is filled with zeros to complete + remainder = len(inputAudio) % samplingRate + if remainder != 0: + sizeToFullSec = samplingRate - remainder + zeroArr = np.zeros(sizeToFullSec) + inputAudio = np.concatenate((inputAudio, zeroArr), axis=0) + duration += 1 + print("Update duration of audio to full second with ", + sizeToFullSec, " zero samples") + print("New number of samples ", len(inputAudio)) + + if duration <= self.xmarkup: + self.xmarkup = duration + 1 + + if self.graph == "ampl": + imgAmplitude = self.drawAmplitude(inputAudio) + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate) + cv.imshow("Display window", imgAmplitude) + cv.waitKey(0) + + elif self.graph == "spec": + stft = self.STFT(inputAudio) + imgSpec = self.drawSpectrogram(stft) + imgSpec = self.drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft) + cv.imshow("Display window", imgSpec) + cv.waitKey(0) + + elif self.graph == "ampl_and_spec": + imgAmplitude = self.drawAmplitude(inputAudio) + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate) + + stft = self.STFT(inputAudio) + imgSpec = self.drawSpectrogram(stft) + imgSpec = self.drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft) + + imgTotal = self.concatenateImages(imgAmplitude, imgSpec) + cv.imshow("Display window", imgTotal) + cv.waitKey(0) + + elif self.draw == "dynamic": + + if self.inputType == "file": + self.dynamicFile(self.audio) + + elif self.inputType == "microphone": + self.dynamicMicrophone() + + + def readAudioFile(self, file): + cap = cv.VideoCapture(file) + + params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream, + cv.CAP_PROP_VIDEO_STREAM, -1, + cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S] + params = np.asarray(params) + + cap.open(file, cv.CAP_ANY, params) + if cap.isOpened() == False: + print("Error : Can't read audio file: '", self.audio, "' with audioStream = ", self.audioStream) + print("Error: problems with audio reading, check input arguments") + exit() + audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX)) + numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS)) + + print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH))))) + print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels) + print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS)) + + frame = [] + frame = np.asarray(frame) + inputAudio = [] + + while (1): + if (cap.grab()): + frame = [] + frame = np.asarray(frame) + frame = cap.retrieve(frame, audioBaseIndex) + for i in range(len(frame[1][0])): + inputAudio.append(frame[1][0][i]) + else: + break + + inputAudio = np.asarray(inputAudio) + print("Number of samples: ", len(inputAudio)) + samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + return samplingRate, inputAudio + + + def readAudioMicrophone(self): + cap = cv.VideoCapture() + + params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1] + params = np.asarray(params) + + cap.open(0, cv.CAP_ANY, params) + if cap.isOpened() == False: + print("Error: Can't open 
microphone") + print("Error: problems with audio reading, check input arguments") + exit() + audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX)) + numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS)) + + print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH))))) + print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels) + print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS)) + + cvTickFreq = cv.getTickFrequency() + sysTimeCurr = cv.getTickCount() + sysTimePrev = sysTimeCurr + + frame = [] + frame = np.asarray(frame) + inputAudio = [] + + while ((sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime): + if (cap.grab()): + frame = [] + frame = np.asarray(frame) + frame = cap.retrieve(frame, audioBaseIndex) + for i in range(len(frame[1][0])): + inputAudio.append(frame[1][0][i]) + sysTimeCurr = cv.getTickCount() + else: + print("Error: Grab error") + break + + inputAudio = np.asarray(inputAudio) + print("Number of samples: ", len(inputAudio)) + samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + + return samplingRate, inputAudio + + + def drawAmplitude(self, inputAudio): + color = (247, 111, 87) + thickness = 5 + frameVectorRows = 500 + middle = frameVectorRows // 2 + + # usually the input data is too big, so it is necessary + # to reduce size using interpolation of data + frameVectorCols = 40000 + if len(inputAudio) < frameVectorCols: + frameVectorCols = len(inputAudio) + + img = np.zeros((frameVectorRows, frameVectorCols, 3), np.uint8) + img += 255 # white background + + audio = np.array(0) + audio = cv.resize(inputAudio, (1, frameVectorCols), interpolation=cv.INTER_LINEAR) + reshapeAudio = np.reshape(audio, (-1)) + + # normalization data by maximum element + minCv, maxCv, _, _ = cv.minMaxLoc(reshapeAudio) + maxElem = int(max(abs(minCv), abs(maxCv))) + + # if all data values are zero (silence) + if maxElem == 0: + maxElem = 1 + for i in range(len(reshapeAudio)): + reshapeAudio[i] = middle - reshapeAudio[i] * middle // maxElem + + for i in range(1, frameVectorCols, 1): + cv.line(img, (i - 1, int(reshapeAudio[i - 1])), (i, int(reshapeAudio[i])), color, thickness) + + img = cv.resize(img, (900, 400), interpolation=cv.INTER_AREA) + return img + + + def drawAmplitudeScale(self, inputImg, inputAudio, samplingRate, xmin=None, xmax=None): + # function of layout drawing for graph of volume amplitudes + # x axis for time + # y axis for amplitudes + + # parameters for the new image size + preCol = 100 + aftCol = 100 + preLine = 40 + aftLine = 50 + + frameVectorRows = inputImg.shape[0] + frameVectorCols = inputImg.shape[1] + + totalRows = preLine + frameVectorRows + aftLine + totalCols = preCol + frameVectorCols + aftCol + + imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8) + imgTotal += 255 # white background + imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg + + # calculating values on x axis + if xmin is None: + xmin = 0 + if xmax is None: + xmax = len(inputAudio) / samplingRate + + if xmax > self.xmarkup: + xList = np.linspace(xmin, xmax, self.xmarkup).astype(int) + else: + # this case is used to display a dynamic update + tmp = np.arange(xmin, xmax, 1).astype(int) + 1 + xList = np.concatenate((np.zeros(self.xmarkup - len(tmp)), tmp[:]), axis=None) + + # calculating values on y axis + ymin = np.min(inputAudio) + ymax = np.max(inputAudio) + yList = 
np.linspace(ymin, ymax, self.ymarkup) + + # parameters for layout drawing + textThickness = 1 + gridThickness = 1 + gridColor = (0, 0, 0) + textColor = (0, 0, 0) + font = cv.FONT_HERSHEY_SIMPLEX + fontScale = 0.5 + + # horizontal axis under the graph + cv.line(imgTotal, (preCol, totalRows - aftLine), + (preCol + frameVectorCols, totalRows - aftLine), + gridColor, gridThickness) + # vertical axis for amplitude + cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows), + gridColor, gridThickness) + + # parameters for layout calculation + serifSize = 10 + indentDownX = serifSize * 2 + indentDownY = serifSize // 2 + indentLeftX = serifSize + indentLeftY = 2 * preCol // 3 + + # drawing layout for x axis + numX = frameVectorCols // (self.xmarkup - 1) + for i in range(len(xList)): + a1 = preCol + i * numX + a2 = frameVectorRows + preLine + b1 = a1 + b2 = a2 + serifSize + if self.enableGrid is True: + d1 = a1 + d2 = preLine + cv.line(imgTotal, (a1, a2), (d1, d2), gridColor, gridThickness) + cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness) + cv.putText(imgTotal, str(int(xList[i])), (b1 - indentLeftX, b2 + indentDownX), + font, fontScale, textColor, textThickness) + + # drawing layout for y axis + numY = frameVectorRows // (self.ymarkup - 1) + for i in range(len(yList)): + a1 = preCol + a2 = totalRows - aftLine - i * numY + b1 = preCol - serifSize + b2 = a2 + if self.enableGrid is True: + d1 = preCol + frameVectorCols + d2 = a2 + cv.line(imgTotal, (a1, a2), (d1, d2), gridColor, gridThickness) + cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness) + cv.putText(imgTotal, str(int(yList[i])), (b1 - indentLeftY, b2 + indentDownY), + font, fontScale, textColor, textThickness) + imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA) + return imgTotal + + + def STFT(self, inputAudio): + """ + The Short-time Fourier transform (STFT), is a Fourier-related transform used to determine + the sinusoidal frequency and phase content of local sections of a signal as it changes over + time. + In practice, the procedure for computing STFTs is to divide a longer time signal into + shorter segments of equal length and then compute the Fourier transform separately on each + shorter segment. This reveals the Fourier spectrum on each shorter segment. One then usually + plots the changing spectra as a function of time, known as a spectrogram or waterfall plot. 
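+
+        For example, with the defaults windLen=256 and overlap=128 the hop size
+        is 128 samples, so one second of audio sampled at 44100 Hz yields about
+        344 windows, each reduced to the 64 (windLen // 4) lowest frequency bins.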
+ + https://en.wikipedia.org/wiki/Short-time_Fourier_transform + """ + + time_step = self.windLen - self.overlap + stft = [] + + if self.windowType == "Hann": + # https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows + Hann_wind = [] + for i in range (1 - self.windLen, self.windLen, 2): + Hann_wind.append(i * (0.5 + 0.5 * math.cos(math.pi * i / (self.windLen - 1)))) + Hann_wind = np.asarray(Hann_wind) + + elif self.windowType == "Hamming": + # https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows + Hamming_wind = [] + for i in range (1 - self.windLen, self.windLen, 2): + Hamming_wind.append(i * (0.53836 - 0.46164 * (math.cos(2 * math.pi * i / (self.windLen - 1))))) + Hamming_wind = np.asarray(Hamming_wind) + + for index in np.arange(0, len(inputAudio), time_step).astype(int): + + section = inputAudio[index:index + self.windLen] + zeroArray = np.zeros(self.windLen - len(section)) + section = np.concatenate((section, zeroArray), axis=None) + + if self.windowType == "Hann": + section *= Hann_wind + elif self.windowType == "Hamming": + section *= Hamming_wind + + dst = np.empty(0) + dst = cv.dft(section, dst, flags=cv.DFT_COMPLEX_OUTPUT) + reshape_dst = np.reshape(dst, (-1)) + # we need only the first part of the spectrum, the second part is symmetrical + complexArr = np.zeros(len(dst) // 4, dtype=complex) + for i in range(len(dst) // 4): + complexArr[i] = complex(reshape_dst[2 * i], reshape_dst[2 * i + 1]) + stft.append(np.abs(complexArr)) + + stft = np.array(stft).transpose() + # convert elements to the decibel scale + np.log10(stft, out=stft, where=(stft != 0.)) + return 10 * stft + + + def drawSpectrogram(self, stft): + + frameVectorRows = stft.shape[0] + frameVectorCols = stft.shape[1] + + # Normalization of image values from 0 to 255 to get more contrast image + # and this normalization will be taken into account in the scale drawing + colormapImageRows = 255 + + imgSpec = np.zeros((frameVectorRows, frameVectorCols, 3), np.uint8) + stftMat = np.zeros((frameVectorRows, frameVectorCols), np.float64) + cv.normalize(stft, stftMat, 1.0, 0.0, cv.NORM_INF) + + for i in range(frameVectorRows): + for j in range(frameVectorCols): + imgSpec[frameVectorRows - i - 1, j] = int(stftMat[i][j] * colormapImageRows) + + imgSpec = cv.applyColorMap(imgSpec, cv.COLORMAP_INFERNO) + imgSpec = cv.resize(imgSpec, (900, 400), interpolation=cv.INTER_LINEAR) + return imgSpec + + + def drawSpectrogramColorbar(self, inputImg, inputAudio, samplingRate, stft, xmin=None, xmax=None): + # function of layout drawing for the three-dimensional graph of the spectrogram + # x axis for time + # y axis for frequencies + # z axis for magnitudes of frequencies shown by color scale + + # parameters for the new image size + preCol = 100 + aftCol = 100 + preLine = 40 + aftLine = 50 + colColor = 20 + ind_col = 20 + + frameVectorRows = inputImg.shape[0] + frameVectorCols = inputImg.shape[1] + + totalRows = preLine + frameVectorRows + aftLine + totalCols = preCol + frameVectorCols + aftCol + colColor + + imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8) + imgTotal += 255 # white background + imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg + + # colorbar image due to drawSpectrogram(..) 
picture has been normalised from 255 to 0, + # so here colorbar has values from 255 to 0 + colorArrSize = 256 + imgColorBar = np.zeros((colorArrSize, colColor, 1), np.uint8) + + for i in range(colorArrSize): + imgColorBar[i] += colorArrSize - 1 - i + + imgColorBar = cv.applyColorMap(imgColorBar, cv.COLORMAP_INFERNO) + imgColorBar = cv.resize(imgColorBar, (colColor, frameVectorRows), interpolation=cv.INTER_AREA) # + + imgTotal[preLine: preLine + frameVectorRows, + preCol + frameVectorCols + ind_col: + preCol + frameVectorCols + ind_col + colColor] = imgColorBar + + # calculating values on x axis + if xmin is None: + xmin = 0 + if xmax is None: + xmax = len(inputAudio) / samplingRate + if xmax > self.xmarkup: + xList = np.linspace(xmin, xmax, self.xmarkup).astype(int) + else: + # this case is used to display a dynamic update + tmpXList = np.arange(xmin, xmax, 1).astype(int) + 1 + xList = np.concatenate((np.zeros(self.xmarkup - len(tmpXList)), tmpXList[:]), axis=None) + + # calculating values on y axis + # according to the Nyquist sampling theorem, + # signal should posses frequencies equal to half of sampling rate + ymin = 0 + ymax = int(samplingRate / 2.) + yList = np.linspace(ymin, ymax, self.ymarkup).astype(int) + + # calculating values on z axis + zList = np.linspace(np.min(stft), np.max(stft), self.zmarkup) + + # parameters for layout drawing + textThickness = 1 + textColor = (0, 0, 0) + gridThickness = 1 + gridColor = (0, 0, 0) + font = cv.FONT_HERSHEY_SIMPLEX + fontScale = 0.5 + + serifSize = 10 + indentDownX = serifSize * 2 + indentDownY = serifSize // 2 + indentLeftX = serifSize + indentLeftY = 2 * preCol // 3 + + # horizontal axis + cv.line(imgTotal, (preCol, totalRows - aftLine), (preCol + frameVectorCols, totalRows - aftLine), + gridColor, gridThickness) + # vertical axis + cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows), + gridColor, gridThickness) + + # drawing layout for x axis + numX = frameVectorCols // (self.xmarkup - 1) + for i in range(len(xList)): + a1 = preCol + i * numX + a2 = frameVectorRows + preLine + b1 = a1 + b2 = a2 + serifSize + cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness) + cv.putText(imgTotal, str(int(xList[i])), (b1 - indentLeftX, b2 + indentDownX), + font, fontScale, textColor, textThickness) + + # drawing layout for y axis + numY = frameVectorRows // (self.ymarkup - 1) + for i in range(len(yList)): + a1 = preCol + a2 = totalRows - aftLine - i * numY + b1 = preCol - serifSize + b2 = a2 + cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness) + cv.putText(imgTotal, str(int(yList[i])), (b1 - indentLeftY, b2 + indentDownY), + font, fontScale, textColor, textThickness) + + # drawing layout for z axis + numZ = frameVectorRows // (self.zmarkup - 1) + for i in range(len(zList)): + a1 = preCol + frameVectorCols + ind_col + colColor + a2 = totalRows - aftLine - i * numZ + b1 = a1 + serifSize + b2 = a2 + cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness) + cv.putText(imgTotal, str(int(zList[i])), (b1 + 10, b2 + indentDownY), + font, fontScale, textColor, textThickness) + imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA) + return imgTotal + + + def concatenateImages(self, img1, img2): + # first image will be under the second image + totalRows = img1.shape[0] + img2.shape[0] + totalCols = max(img1.shape[1], img2.shape[1]) + + # if images columns do not match, the difference is filled in white + imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8) + imgTotal += 255 + + 
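+        # img1 fills the top rows and img2 the bottom rows; the second slice
+        # starts at img2.shape[0], which lines up only because both graphs are
+        # resized to the same height (self.rows) before this call - with equal
+        # widths the result matches np.vstack((img1, img2))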
imgTotal[:img1.shape[0], :img1.shape[1]] = img1 + imgTotal[img2.shape[0]:, :img2.shape[1]] = img2 + + return imgTotal + + + def dynamicFile(self, file): + cap = cv.VideoCapture(file) + params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream, + cv.CAP_PROP_VIDEO_STREAM, -1, + cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S] + params = np.asarray(params) + + cap.open(file, cv.CAP_ANY, params) + if cap.isOpened() == False: + print("ERROR! Can't to open file") + return + + audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX)) + numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS)) + samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + + print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH))))) + print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels) + print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS)) + + step = int(self.updateTime * samplingRate) + frameSize = int(self.frameSizeTime * samplingRate) + # since the dimensional grid is counted in integer seconds, + # if duration of audio frame is less than xmarkup, to avoid an incorrect display, + # xmarkup will be taken equal to duration + if self.frameSizeTime <= self.xmarkup: + self.xmarkup = self.frameSizeTime + + buffer = [] + section = np.zeros(frameSize, dtype=np.int16) + currentSamples = 0 + + while (1): + if (cap.grab()): + frame = [] + frame = np.asarray(frame) + frame = cap.retrieve(frame, audioBaseIndex) + + for i in range(len(frame[1][0])): + buffer.append(frame[1][0][i]) + + buffer_size = len(buffer) + if (buffer_size >= step): + + section = list(section) + currentSamples += step + + del section[0:step] + section.extend(buffer[0:step]) + del buffer[0:step] + + section = np.asarray(section) + + if currentSamples < frameSize: + xmin = 0 + xmax = (currentSamples) / samplingRate + else: + xmin = (currentSamples - frameSize) / samplingRate + 1 + xmax = (currentSamples) / samplingRate + + if self.graph == "ampl": + imgAmplitude = self.drawAmplitude(section) + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax) + cv.imshow("Display amplitude graph", imgAmplitude) + cv.waitKey(self.waitTime) + + elif self.graph == "spec": + stft = self.STFT(section) + imgSpec = self.drawSpectrogram(stft) + imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax) + cv.imshow("Display spectrogram", imgSpec) + cv.waitKey(self.waitTime) + + elif self.graph == "ampl_and_spec": + + imgAmplitude = self.drawAmplitude(section) + stft = self.STFT(section) + imgSpec = self.drawSpectrogram(stft) + + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax) + imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax) + + imgTotal = self.concatenateImages(imgAmplitude, imgSpec) + cv.imshow("Display amplitude graph and spectrogram", imgTotal) + cv.waitKey(self.waitTime) + else: + break + + + def dynamicMicrophone(self): + cap = cv.VideoCapture() + params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1] + params = np.asarray(params) + + cap.open(0, cv.CAP_ANY, params) + if cap.isOpened() == False: + print("ERROR! 
Can't to open file") + return + audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX)) + numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS)) + + print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH))))) + print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels) + print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS)) + + frame = [] + frame = np.asarray(frame) + samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND)) + + step = int(self.updateTime * samplingRate) + frameSize = int(self.frameSizeTime * samplingRate) + self.xmarkup = self.frameSizeTime + + currentSamples = 0 + + buffer = [] + section = np.zeros(frameSize, dtype=np.int16) + + cvTickFreq = cv.getTickFrequency() + sysTimeCurr = cv.getTickCount() + sysTimePrev = sysTimeCurr + self.waitTime = self.updateTime * 1000 + while ((sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime): + if (cap.grab()): + frame = [] + frame = np.asarray(frame) + frame = cap.retrieve(frame, audioBaseIndex) + + for i in range(len(frame[1][0])): + buffer.append(frame[1][0][i]) + + sysTimeCurr = cv.getTickCount() + buffer_size = len(buffer) + if (buffer_size >= step): + + section = list(section) + currentSamples += step + + del section[0:step] + section.extend(buffer[0:step]) + del buffer[0:step] + + section = np.asarray(section) + + if currentSamples < frameSize: + xmin = 0 + xmax = (currentSamples) / samplingRate + else: + xmin = (currentSamples - frameSize) / samplingRate + 1 + xmax = (currentSamples) / samplingRate + + if self.graph == "ampl": + imgAmplitude = self.drawAmplitude(section) + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax) + cv.imshow("Display amplitude graph", imgAmplitude) + cv.waitKey(self.waitTime) + + elif self.graph == "spec": + stft = self.STFT(section) + imgSpec = self.drawSpectrogram(stft) + imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax) + cv.imshow("Display spectrogram", imgSpec) + cv.waitKey(self.waitTime) + + elif self.graph == "ampl_and_spec": + imgAmplitude = self.drawAmplitude(section) + stft = self.STFT(section) + imgSpec = self.drawSpectrogram(stft) + + imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax) + imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax) + + imgTotal = self.concatenateImages(imgAmplitude, imgSpec) + cv.imshow("Display amplitude graph and spectrogram", imgTotal) + cv.waitKey(self.waitTime) + else: + break + + + def initAndCheckArgs(self, args): + if args.inputType != "file" and args.inputType != "microphone": + print("Error: ", args.inputType, " input method doesnt exist") + return False + if args.draw != "static" and args.draw != "dynamic": + print("Error: ", args.draw, " draw type doesnt exist") + return False + if args.graph != "ampl" and args.graph != "spec" and args.graph != "ampl_and_spec": + print("Error: ", args.graph, " type of graph doesnt exist") + return False + if args.windowType != "Rect" and args.windowType != "Hann" and args.windowType != "Hamming": + print("Error: ", args.windowType, " type of window doesnt exist") + return False + if args.windLen <= 0: + print("Error: windLen = ", args.windLen, " - incorrect value. Must be > 0") + return False + if args.overlap <= 0: + print("Error: overlap = ", args.overlap, " - incorrect value. 
Must be > 0") + return False + if args.rows <= 0: + print("Error: rows = ", args.rows, " - incorrect value. Must be > 0") + return False + if args.cols <= 0: + print("Error: cols = ", args.cols, " - incorrect value. Must be > 0") + return False + if args.xmarkup < 2: + print("Error: xmarkup = ", args.xmarkup, " - incorrect value. Must be >= 2") + return False + if args.ymarkup < 2: + print("Error: ymarkup = ", args.ymarkup, " - incorrect value. Must be >= 2") + return False + if args.zmarkup < 2: + print("Error: zmarkup = ", args.zmarkup, " - incorrect value. Must be >= 2") + return False + if args.microTime <= 0: + print("Error: microTime = ", args.microTime, " - incorrect value. Must be > 0") + return False + if args.frameSizeTime <= 0: + print("Error: frameSizeTime = ", args.frameSizeTime, " - incorrect value. Must be > 0") + return False + if args.updateTime <= 0: + print("Error: updateTime = ", args.updateTime, " - incorrect value. Must be > 0") + return False + if args.waitTime < 0: + print("Error: waitTime = ", args.waitTime, " - incorrect value. Must be >= 0") + return False + return True + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, + description='''this sample draws a volume graph and/or spectrogram of audio/video files and microphone\nDefault usage: ./Spectrogram.exe''') + + parser.add_argument("-i", "--inputType", dest="inputType", type=str, default="file", help="file or microphone") + parser.add_argument("-d", "--draw", dest="draw", type=str, default="static", + help="type of drawing: static - for plotting graph(s) across the entire input audio; dynamic - for plotting graph(s) in a time-updating window") + parser.add_argument("-g", "--graph", dest="graph", type=str, default="ampl_and_spec", + help="type of graph: amplitude graph or/and spectrogram. Please use tags below : ampl - draw the amplitude graph; spec - draw the spectrogram; ampl_and_spec - draw the amplitude graph and spectrogram on one image under each other") + + parser.add_argument("-a", "--audio", dest="audio", type=str, default='Megamind.avi', + help="name and path to file") + parser.add_argument("-s", "--audioStream", dest="audioStream", type=int, default=1, + help=" CAP_PROP_AUDIO_STREAM value") + + parser.add_argument("-t", '--windowType', dest="windowType", type=str, default="Rect", + help="type of window for STFT. Please use tags below : Rect/Hann/Hamming") + parser.add_argument("-l", '--windLen', dest="windLen", type=int, default=256, help="size of window for STFT") + parser.add_argument("-o", '--overlap', dest="overlap", type=int, default=128, help="overlap of windows for STFT") + + parser.add_argument("-gd", '--grid', dest="enableGrid", type=bool, default=False, help="grid on amplitude graph(on/off)") + + parser.add_argument("-r", '--rows', dest="rows", type=int, default=400, help="rows of output image") + parser.add_argument("-c", '--cols', dest="cols", type=int, default=900, help="cols of output image") + + parser.add_argument("-x", '--xmarkup', dest="xmarkup", type=int, default=5, + help="number of x axis divisions (time asix)") + parser.add_argument("-y", '--ymarkup', dest="ymarkup", type=int, default=5, + help="number of y axis divisions (frequency or/and amplitude axis)") # ? + parser.add_argument("-z", '--zmarkup', dest="zmarkup", type=int, default=5, + help="number of z axis divisions (colorbar)") # ? 
+
+    parser.add_argument("-m", '--microTime', dest="microTime", type=int, default=20,
+                        help="time of recording audio with microphone in seconds")
+    parser.add_argument("-f", '--frameSizeTime', dest="frameSizeTime", type=int, default=5,
+                        help="size of sliding window in seconds")
+    parser.add_argument("-u", '--updateTime', dest="updateTime", type=int, default=1,
+                        help="update time of sliding window in seconds")
+    parser.add_argument("-w", '--waitTime', dest="waitTime", type=int, default=10,
+                        help="parameter to cv.waitKey() for dynamic update, takes values in milliseconds")
+
+    args = parser.parse_args()
+
+    AudioDrawing(args).Draw()
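For quick reference, the AudioIO read loop that both samples are built around reduces to the sketch below. It reuses only calls that appear in the samples above; Megamind.avi is the samples' default clip (resolved there via cv.samples.findFile, assumed here to sit in the working directory), and an OpenCV build with audio support is assumed:

    import numpy as np
    import cv2 as cv

    # open only the audio stream of a media file, as readAudioFile() does above
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S])
    cap = cv.VideoCapture()
    cap.open("Megamind.avi", cv.CAP_ANY, params)
    assert cap.isOpened(), "no audio backend available, or file not found"

    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))

    samples = []
    while cap.grab():                     # grab() returns False at end of stream
        ok, frame = cap.retrieve(None, audioBaseIndex)
        if ok:
            samples.extend(frame[0])      # row 0: first channel, int16 samples
    print(len(samples), "samples at", samplingRate, "Hz")

The full samples are run the same way as any OpenCV sample, e.g. "python audio_spectrogram.py -i microphone -d dynamic -g spec" for a live spectrogram.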