opencv/samples/cpp/audio_spectrogram.cpp

#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

#include <iostream>
#include <vector>
#include <string>
#include <cmath>
using namespace cv;
using namespace std;


class AudioDrawing
{

public:

    AudioDrawing(const CommandLineParser& parser) {
        if (!initAndCheckArgs(parser))
        {
            cerr << "Error: Wrong input arguments" << endl;
            exit(0);
        }
        Draw();
    }

    void Draw() {
        if (draw == "static")
        {
            vector<int>inputAudio = {};
            int samplingRate = 0;
            if (inputType == "file")
            {
                samplingRate = readAudioFile(audio, inputAudio);
            }
            else if (inputType == "microphone")
            {
                samplingRate = readAudioMicrophone(inputAudio);
            }
            if ((inputAudio.size() == 0) || samplingRate <= 0)
            {
                cerr << "Error: problems with audio reading, check input arguments" << endl;
                return;
            }

            int duration = static_cast<int>(inputAudio.size()) / samplingRate;

            // since the dimensional grid is counted in integer seconds,
            // if the input audio has an incomplete last second,
            // then it is filled with zeros to complete
            int remainder = static_cast<int>(inputAudio.size()) % samplingRate;
            if (remainder)
            {
                int sizeToFullSec = samplingRate - remainder;
                for (int j = 0; j < sizeToFullSec; ++j)
                {
                    inputAudio.push_back(0);
                }
                duration += 1;
                cout << "Update duration of audio to full last second with " <<
                        sizeToFullSec << " zero samples" << endl;
                cout << "New number of samples " << inputAudio.size() << endl;
            }
            cout << "Duration of audio = " << duration << " seconds" << endl;

            // since the dimensional grid is counted in integer seconds,
            // if duration of file is less than xmarkup, to avoid an incorrect display,
            // xmarkup will be taken equal to duration
            if (duration <= xmarkup)
            {
                xmarkup = duration + 1;
            }

            if (graph == "ampl")
            {
                Mat imgAmplitude = drawAmplitude(inputAudio);
                imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
                imshow("Display amplitude graph", imgAmplitude);
                waitKey(0);
            }
            else if (graph == "spec")
            {
                vector<vector<double>>stft = STFT(inputAudio);
                Mat imgSpec = drawSpectrogram(stft);
                imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
                imshow("Display spectrogram", imgSpec);
                waitKey(0);
            }
            else if (graph == "ampl_and_spec")
            {
                Mat imgAmplitude = drawAmplitude(inputAudio);
                imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
                vector<vector<double>>stft = STFT(inputAudio);
                Mat imgSpec = drawSpectrogram(stft);
                imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
                Mat imgTotal = concatenateImages(imgAmplitude, imgSpec);
                imshow("Display amplitude graph and spectrogram", imgTotal);
                waitKey(0);
            }
        }
        else if (draw == "dynamic")
        {
            if (inputType == "file")
            {
                dynamicFile(audio);
            }
            else if (inputType == "microphone")
            {
                dynamicMicrophone();
            }
        }
    }

    ~AudioDrawing() {
    }

    int readAudioFile(string file, vector<int>& inputAudio)
    {
        VideoCapture cap;
        vector<int> params {    CAP_PROP_AUDIO_STREAM, audioStream,
                                CAP_PROP_VIDEO_STREAM, -1,
                                CAP_PROP_AUDIO_DATA_DEPTH, CV_16S   };

        cap.open(file, CAP_ANY, params);
        if (!cap.isOpened())
        {
            cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
            return -1;
        }
        const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
        const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
        cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
        int samplingRate =  static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
        cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;

        vector<int> frameVec;
        Mat frame;
        for (;;)
        {
            if (cap.grab())
            {
                cap.retrieve(frame, audioBaseIndex);
                frameVec = frame;
                inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
            }
            else
            {
                cout << "Number of samples: " << inputAudio.size() << endl;
                break;
            }
        }
        return samplingRate;
    }

    int readAudioMicrophone(vector<int>& inputAudio)
    {
        VideoCapture cap;
        vector<int> params {    CAP_PROP_AUDIO_STREAM, 0,
                                CAP_PROP_VIDEO_STREAM, -1   };

        cap.open(0, CAP_ANY, params);
        if (!cap.isOpened())
        {
            cerr << "Error: Can't open microphone" << endl;
            return -1;
        }

        const int audioBaseIndex =  static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
        const int numberOfChannels =  static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
        cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
        int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
        cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;

        const double cvTickFreq = getTickFrequency();
        int64 sysTimeCurr = getTickCount();
        int64 sysTimePrev = sysTimeCurr;

        vector<int> frameVec;
        Mat frame;
        while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
        {
            if (cap.grab())
            {
                cap.retrieve(frame, audioBaseIndex);
                frameVec = frame;
                inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
                sysTimeCurr = getTickCount();
            }
            else
            {
                cerr << "Error: Grab error" << endl;
                break;
            }
        }
        cout << "Number of samples: " << inputAudio.size() << endl;
        return samplingRate;
    }


    Mat drawAmplitude(vector<int>& inputAudio)
    {
        Scalar color = Scalar(247,111,87);
        int thickness = 5;
        int frameVectorRows = 500;
        int middle = frameVectorRows / 2;
        // usually the input data is too big, so it is necessary
        // to reduce size using interpolation of data
        int frameVectorCols = 40000;
        if (static_cast<int>(inputAudio.size()) < frameVectorCols)
        {
            frameVectorCols = static_cast<int>(inputAudio.size());
        }

        Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background

        vector<double>reshapeAudio(inputAudio.size());
        for (size_t i = 0; i < inputAudio.size(); ++i)
        {
            reshapeAudio[i]=static_cast<double>(inputAudio[i]);
        }

        Mat img_frameVector( 1, static_cast<int>(reshapeAudio.size()), CV_64F , reshapeAudio.data());
        Mat img_frameVector_resize;
        resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR);
        reshapeAudio = img_frameVector_resize;

        // normalization data by maximum element
        normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF);

        for (size_t i = 0; i < reshapeAudio.size(); ++i)
        {
            reshapeAudio[i] = middle - reshapeAudio[i] * middle;
        }

        for (int i = 1; i < static_cast<int>(reshapeAudio.size()); ++i)
        {
            line(img, Point(i-1, static_cast<int>(reshapeAudio[i-1])), Point(i, static_cast<int>(reshapeAudio[i])), color, thickness);
        }
        Mat resImage;
        resize(img, resImage, Size(900, 400), INTER_AREA );
        return resImage;
    }

    Mat drawAmplitudeScale(Mat& inputImg, const vector<int>& inputAudio, int samplingRate,
                           int xmin = 0, int xmax = 0)
    {
        // function of layout drawing for graph of volume amplitudes
        // x axis for time
        // y axis for amplitudes

        // parameters for the new image size
        int preCol = 100;
        int aftCol = 100;
        int preLine = 40;
        int aftLine = 50;

        int frameVectorRows = inputImg.rows;
        int frameVectorCols = inputImg.cols;

        int totalRows = preLine + frameVectorRows + aftLine;
        int totalCols = preCol + frameVectorCols + aftCol;

        Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255));
        inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows)));


        // calculating values on x axis
        if (xmax == 0)
        {
            xmax = static_cast<int>(inputAudio.size()) / samplingRate;
        }
        std::vector<double> xList(xmarkup);
        if (xmax >= xmarkup)
        {
            double deltax = (xmax - xmin) / (xmarkup - 1);
            for (int i = 0; i < xmarkup; ++i)
            {
                xList[i] = (xmin + deltax * i);
            }
        }
        else
        {
            // this case is used to display a dynamic update
            vector<double> tmpXList;
            for (int i = xmin; i < xmax; ++i)
            {
                tmpXList.push_back(i + 1);
            }
            int k = 0;
            for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
            {
                xList[i] = tmpXList[k];
                k += 1;
            }
        }

        // calculating values on y axis
        double minCv; double maxCv; Point minLoc; Point maxLoc;
        minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc);
        int ymin = static_cast<int>(minCv);
        int ymax = static_cast<int>(maxCv);

        std::vector<double> yList(ymarkup);
        double deltay = (ymax - ymin) / (ymarkup - 1);
        for (int i = 0; i < ymarkup; ++i)
        {
            yList[i] = ymin + deltay * i;
        }

        // parameters for layout drawing
        int textThickness = 1;
        int gridThickness = 1;
        Scalar gridColor(0, 0, 0);
        Scalar textColor(0, 0, 0);
        float fontScale = 0.5;

        // horizontal axis
        line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
            gridColor, gridThickness);
        // vertical axis
        line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
            gridColor, gridThickness);

        // parameters for layout calculation
        int serifSize = 10;
        int indentDownX = serifSize * 2;
        int indentDownY = serifSize / 2;
        int indentLeftX = serifSize;
        int indentLeftY = 2 * preCol / 3;


        // drawing layout for x axis
        int numX = frameVectorCols / (xmarkup - 1);
        for (size_t i = 0; i < xList.size(); ++i)
        {
            int a1 = static_cast<int>(preCol + i * numX);
            int a2 = frameVectorRows + preLine;

            int b1 = a1;
            int b2 = a2 + serifSize;

            if (enableGrid)
            {
                int d1 = a1;
                int d2 = preLine;
                line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
            }
            line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
            putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
                    FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
        }

        // drawing layout for y axis
        int numY = frameVectorRows / (ymarkup - 1);
        for (size_t i = 0; i < yList.size(); ++i) {
            int a1 = preCol;
            int a2 = static_cast<int>(totalRows - aftLine - i * numY);
            int b1 = preCol - serifSize;
            int b2 = a2;
            if (enableGrid)
            {
                int d1 = preCol + frameVectorCols;
                int d2 = a2;
                line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness);
            }
            line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
            putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
                    FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
        }
        Mat resImage;
        resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
        return resImage;
    }

    vector<vector<double>> STFT(const vector<int>& inputAudio)
    {
        // The Short-time Fourier transform (STFT), is a Fourier-related transform used to
        // determine the sinusoidal frequency and phase content of local sections of a signal
        // as it changes over time.
        // In practice, the procedure for computing STFTs is to divide a longer time signal
        // into shorter segments of equal length and then compute the Fourier transform separately
        // on each shorter segment. This reveals the Fourier spectrum on each shorter segment.
        // One then usually plots the changing spectra as a function of time, known as a spectrogram
        // or waterfall plot.
        // https://en.wikipedia.org/wiki/Short-time_Fourier_transform

        int timeStep = windLen - overlap;
        Mat dstMat;
        vector<double> stftRow;
        vector<double> WindType;
        if (windowType == "Hann")
        {
            // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
            for (int j = 1 - windLen; j < windLen; j+=2)
            {
                WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1)))));
            }
        }
        else if (windowType == "Hamming")
        {
            // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
            for (int j = 1 - windLen; j < windLen; j+=2)
            {
                WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1)))));
            }
        }
        for (size_t i = 0; i < inputAudio.size(); i += timeStep)
        {
            vector<double>section(windLen, 0);
            for (int j = 0; j < windLen; ++j)
            {
                section[j] = inputAudio[j + i];
            }
            if (windowType == "Hann" || windowType == "Hamming")
            {
                for (size_t j = 0; j < section.size(); ++j)
                {
                    section[j] *= WindType[j];
                }
            }

            dft(section, dstMat, DFT_COMPLEX_OUTPUT);

            for (int j = 0; j < dstMat.cols / 4; ++j)
            {
                double complModule = sqrt(dstMat.at<double>(2*j) * dstMat.at<double>(2*j) +
                                        dstMat.at<double>(2*j+1) * dstMat.at<double>(2*j+1));
                stftRow.push_back(complModule);
            }
        }

        size_t xSize = inputAudio.size() / timeStep + 1;
        // we need only the first part of the spectrum, the second part is symmetrical
        size_t ySize = dstMat.cols / 4;

        vector<vector<double>> stft(ySize, vector<double>(xSize, 0.));
        for (size_t i = 0; i < xSize; ++i)
        {
            for (size_t j = 0; j < ySize; ++j)
            {
                // write elements with transposition and convert it to the decibel scale
                double stftElem = stftRow[ i * ySize + j];
                if (stftElem != 0.)
                {
                    stft[j][i] = 10 * log10(stftElem);
                }
            }
        }
        return stft;
    }

    Mat drawSpectrogram(const vector<vector<double>>& stft)
    {
        int frameVectorRows = static_cast<int>(stft.size());
        int frameVectorCols = static_cast<int>(stft[0].size());

        // Normalization of image values from 0 to 255 to get more contrast image
        // and this normalization will be taken into account in the scale drawing
        int colormapImageRows = 255;

        double minCv; double maxCv; Point minLoc; Point maxLoc;
        minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
        double maxStft = max(abs(maxCv), abs(minCv));

        for (int i = 1; i < frameVectorRows; ++i)
        {
            minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
            maxStft = max(maxStft, max(abs(maxCv), abs(minCv)));
        }
        // if maxStft is zero (silence)
        if (maxStft == 0.)
        {
            maxStft = 1;
        }
        Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255));

        for (int i = 0; i < frameVectorRows; ++i)
        {
            for (int j = 0; j < frameVectorCols; ++j)
            {
                imgSpec.at<uchar>(frameVectorRows - i - 1, j) = static_cast<uchar>(stft[i][j] * colormapImageRows / maxStft);
            }
        }
        applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO);
        Mat resImage;
        resize(imgSpec, resImage, Size(900, 400), INTER_AREA);
        return resImage;
    }

    Mat drawSpectrogramColorbar(Mat& inputImg, const vector<int>& inputAudio,
                                int samplingRate, const vector<vector<double>>& stft,
                                int xmin = 0, int xmax = 0)
    {
        // function of layout drawing for the three-dimensional graph of the spectrogram
        // x axis for time
        // y axis for frequencies
        // z axis for magnitudes of frequencies shown by color scale

        // parameters for the new image size
        int preCol = 100;
        int aftCol = 100;
        int preLine = 40;
        int aftLine = 50;
        int colColor = 20;
        int indCol = 20;

        int frameVectorRows = inputImg.rows;
        int frameVectorCols = inputImg.cols;

        int totalRows = preLine + frameVectorRows + aftLine;
        int totalCols = preCol + frameVectorCols + aftCol;

        Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));
        inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows)));

        // colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
        // so here colorbar has values from 255 to 0
        int colorArrSize = 256;
        Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255));
        for (int i = 0; i < colorArrSize; ++i)
        {
            for( int j = 0; j < colColor; ++j)
            {
                imgColorBar.at<uchar>(i, j) = static_cast<uchar>(colorArrSize - 1 - i); // from 255 to 0
            }
        }

        applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO);
        resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA);
        imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows)));


        // calculating values on x axis
        if (xmax == 0)
        {
            xmax = static_cast<int>(inputAudio.size()) / samplingRate + 1;
        }
        vector<double> xList(xmarkup, 0);
        if (xmax >= xmarkup)
        {
            double deltax = (xmax - xmin) / (xmarkup - 1);
            for(int i = 0; i < xmarkup; ++i)
            {
                xList[i] = xmin + deltax * i;
            }
        }
        else
        {
            // this case is used to display a dynamic update
            vector<double> tmpXList;
            for(int i = xmin; i < xmax; ++i)
            {
                tmpXList.push_back(i + 1);
            }
            int k = 0;
            for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i)
            {
                xList[i] = tmpXList[k];
                k += 1;
            }
        }

        // calculating values on y axis
        // according to the Nyquist sampling theorem,
        // signal should posses frequencies equal to half of sampling rate
        int ymin = 0;
        int ymax = static_cast<int>(samplingRate / 2);

        vector<double> yList;
        double deltay = (ymax - ymin) / (ymarkup - 1);
        for(int i = 0; i < ymarkup; ++i)
        {
            yList.push_back(ymin + deltay * i);
        }

        // calculating values on z axis
        double minCv; double maxCv; Point minLoc; Point maxLoc;
        minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc);
        double zmin = minCv, zmax = maxCv;

        std::vector<double> zList;
        for (size_t i = 1; i < stft.size(); ++i)
        {
            minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc);
            zmax = max(zmax, maxCv);
            zmin = min(zmin, minCv);
        }
        double deltaz = (zmax - zmin) / (zmarkup - 1);
        for(int i = 0; i < zmarkup; ++i)
        {
            zList.push_back(zmin + deltaz * i);
        }

        // parameters for layout drawing
        int textThickness = 1;
        int gridThickness = 1;
        Scalar gridColor(0,0,0);
        Scalar textColor(0,0,0);
        float fontScale = 0.5;

        int serifSize = 10;
        int indentDownX = serifSize * 2;
        int indentDownY = serifSize / 2;
        int indentLeftX = serifSize;
        int indentLeftY = 2 * preCol / 3;

        // horizontal axis
        line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
                            gridColor, gridThickness);
        // vertical axis
        line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
                            gridColor, gridThickness);

        // drawing layout for x axis
        int numX = frameVectorCols / (xmarkup - 1);
        for (size_t i = 0; i < xList.size(); ++i)
        {
            int a1 = static_cast<int>(preCol + i * numX);
            int a2 = frameVectorRows + preLine;

            int b1 = a1;
            int b2 = a2 + serifSize;

            line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
            putText(imgTotal, to_string(static_cast<int>(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
                    FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
        }

        // drawing layout for y axis
        int numY = frameVectorRows / (ymarkup - 1);
        for (size_t i = 0; i < yList.size(); ++i)
        {
            int a1 = preCol;
            int a2 = static_cast<int>(totalRows - aftLine - i * numY);

            int b1 = preCol - serifSize;
            int b2 = a2;

            line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
            putText(imgTotal, to_string(static_cast<int>(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
                    FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
        }

        // drawing layout for z axis
        int numZ = frameVectorRows / (zmarkup - 1);
        for (size_t i = 0; i < zList.size(); ++i)
        {
            int a1 = preCol + frameVectorCols + indCol + colColor;
            int a2 = static_cast<int>(totalRows - aftLine - i * numZ);

            int b1 = a1 + serifSize;
            int b2 = a2;

            line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
            putText(imgTotal, to_string(static_cast<int>(zList[i])), Point(b1 + 10, b2 + indentDownY),
                    FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
        }
        Mat resImage;
        resize(imgTotal, resImage, Size(cols, rows), INTER_AREA );
        return resImage;
    }

    Mat concatenateImages(Mat& img1, Mat& img2)
    {
        // first image will be under the second image
        int totalRows = img1.rows + img2.rows;
        int totalCols = max(img1.cols , img2.cols);
        // if images columns do not match, the difference is filled in white
        Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255));

        img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows)));
        img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows)));
        return imgTotal;
    }

    void dynamicFile(const string file)
    {
        VideoCapture cap;
        vector<int> params {    CAP_PROP_AUDIO_STREAM, audioStream,
                                CAP_PROP_VIDEO_STREAM, -1,
                                CAP_PROP_AUDIO_DATA_DEPTH, CV_16S   };

        cap.open(file, CAP_ANY, params);
        if (!cap.isOpened())
        {
            cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
            return;
        }

        const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
        const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
        int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));

        cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
        cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;

        int step = static_cast<int>(updateTime * samplingRate);
        int frameSize = static_cast<int>(frameSizeTime * samplingRate);

        // since the dimensional grid is counted in integer seconds,
        // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
        // xmarkup will be taken equal to duration
        if (frameSizeTime <= xmarkup)
        {
            xmarkup = frameSizeTime;
        }

        vector<int> buffer;
        vector<int> frameVector;
        vector<int> section(frameSize, 0);
        vector<vector<double>>stft;
        Mat frame, imgAmplitude, imgSpec, imgTotal;
        int currentSamples = 0;
        int xmin = 0;
        int xmax = 0;

        for (;;)
        {
            if (cap.grab())
            {
                cap.retrieve(frame, audioBaseIndex);
                frameVector = frame;
                buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
                int bufferSize = static_cast<int>(buffer.size());
                if (bufferSize >= step)
                {
                    currentSamples += bufferSize;
                    section.erase(section.begin(), section.begin() + step);
                    section.insert(section.end(), buffer.begin(), buffer.end());
                    buffer.erase(buffer.begin(), buffer.begin() + step);
                    if (currentSamples < frameSize)
                    {
                        xmin = 0;
                        xmax = (currentSamples) / samplingRate;
                    }
                    else
                    {
                        xmin = (currentSamples - frameSize) / samplingRate + 1;
                        xmax = (currentSamples) / samplingRate;
                    }

                    if (graph == "ampl")
                    {
                        imgAmplitude = drawAmplitude(section);
                        imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
                        imshow("Display amplitude graph", imgAmplitude);
                        waitKey(waitTime);
                    }
                    else if (graph == "spec")
                    {
                        stft = STFT(section);
                        imgSpec = drawSpectrogram(stft);
                        imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
                        imshow("Display spectrogram", imgSpec);
                        waitKey(waitTime);
                    }
                    else if (graph == "ampl_and_spec")
                    {
                        imgAmplitude = drawAmplitude(section);
                        imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
                        stft = STFT(section);
                        imgSpec = drawSpectrogram(stft);
                        imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
                        imgTotal = concatenateImages(imgAmplitude, imgSpec);
                        imshow("Display amplitude graph and spectrogram", imgTotal);
                        waitKey(waitTime);
                    }
                }
            }
            else
            {
                break;
            }
        }

    }

    void dynamicMicrophone()
    {
        VideoCapture cap;
        vector<int> params {    CAP_PROP_AUDIO_STREAM, 0,
                                CAP_PROP_VIDEO_STREAM, -1   };

        cap.open(0, CAP_MSMF, params);
        if (!cap.isOpened())
        {
            cerr << "Error: Can't open microphone" << endl;
            return;
        }

        const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
        const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
        int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
        cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
        cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
        cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;

        const double cvTickFreq = getTickFrequency();
        int64 sysTimeCurr = getTickCount();
        int64 sysTimePrev = sysTimeCurr;

        int step = (updateTime * samplingRate);
        int frameSize = (frameSizeTime * samplingRate);
        // since the dimensional grid is counted in integer seconds,
        // if duration of audio frame is less than xmarkup, to avoid an incorrect display,
        // xmarkup will be taken equal to duration
        if (frameSizeTime <= xmarkup)
        {
            xmarkup = frameSizeTime;
        }

        vector<int> frameVector;
        vector<int> buffer;
        vector<int> section(frameSize, 0);
        Mat frame, imgAmplitude, imgSpec, imgTotal;

        int currentSamples = 0;
        vector<vector<double>> stft;
        int xmin = 0;
        int xmax = 0;
        waitTime = updateTime * 1000;
        while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
        {
            if (cap.grab())
            {
                cap.retrieve(frame, audioBaseIndex);
                frameVector = frame;
                buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
                sysTimeCurr = getTickCount();

                int bufferSize = static_cast<int>(buffer.size());
                if (bufferSize >= step)
                {
                    currentSamples += step;
                    section.erase(section.begin(), section.begin() + step);
                    section.insert(section.end(), buffer.begin(), buffer.end());
                    buffer.erase(buffer.begin(), buffer.begin() + step);

                    if (currentSamples < frameSize)
                    {
                        xmin = 0;
                        xmax = (currentSamples) / samplingRate;
                    }
                    else
                    {
                        xmin = (currentSamples - frameSize) / samplingRate + 1;
                        xmax = (currentSamples) / samplingRate;
                    }

                    if (graph == "ampl")
                    {
                        imgAmplitude = drawAmplitude(section);
                        imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
                        imshow("Display amplitude graph", imgAmplitude);
                        waitKey(waitTime);
                    }
                    else if (graph == "spec")
                    {
                        stft = STFT(section);
                        imgSpec = drawSpectrogram(stft);
                        imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
                        imshow("Display spectrogram", imgSpec);
                        waitKey(waitTime);
                    }
                    else if (graph == "ampl_and_spec")
                    {
                        imgAmplitude = drawAmplitude(section);
                        imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
                        stft = STFT(section);
                        imgSpec = drawSpectrogram(stft);
                        imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
                        imgTotal = concatenateImages(imgAmplitude, imgSpec);
                        imshow("Display amplitude graph and spectrogram", imgTotal);
                        waitKey(waitTime);
                    }
                }
            }
            else
            {
                cerr << "Error: Grab error" << endl;
                break;
            }
        }

    }

    bool initAndCheckArgs(const CommandLineParser& parser)
    {
        inputType = parser.get<string>("inputType");
        if ((inputType != "file") && (inputType != "microphone"))
        {
            cout << "Error: " << inputType << " input method doesnt exist" << endl;
            return false;
        }

        draw = parser.get<string>("draw");
        if ((draw != "static") && (draw != "dynamic"))
        {
            cout << "Error: " << draw << " draw type doesnt exist" << endl;
            return false;
        }

        graph = parser.get<string>("graph");
        if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec"))
        {
            cout << "Error: " << graph << " type of graph doesnt exist" << endl;
            return false;
        }

        audio = samples::findFile(parser.get<std::string>("audio"));

        audioStream = parser.get<int>("audioStream");
        if (audioStream < 0)
        {
            cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl;
            return false;
        }
        windowType = parser.get<string>("windowType");
        if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming"))
        {
            cout << "Error: " << windowType << " type of window doesnt exist" << endl;
            return false;
        }

        windLen = parser.get<int>("windLen");
        if (windLen <= 0)
        {
            cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl;
            return false;
        }

        overlap = parser.get<int>("overlap");
        if (overlap <= 0)
        {
            cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl;
            return false;
        }

        enableGrid = parser.get<bool>("enableGrid");

        rows = parser.get<int>("rows");
        if (rows <= 0)
        {
            cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl;
            return false;
        }
        cols = parser.get<int>("cols");

        if (cols <= 0)
        {
            cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl;
            return false;
        }
        xmarkup = parser.get<int>("xmarkup");
        if (xmarkup < 2)
        {
            cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl;
            return false;
        }
        ymarkup = parser.get<int>("ymarkup");
        if (ymarkup < 2)
        {
            cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl;
            return false;
        }
        zmarkup = parser.get<int>("zmarkup");
        if (zmarkup < 2)
        {
            cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl;
            return false;
        }
        microTime = parser.get<int>("microTime");
        if (microTime <= 0)
        {
            cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl;
            return false;
        }
        frameSizeTime = parser.get<int>("frameSizeTime");
        if (frameSizeTime <= 0)
        {
            cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl;
            return false;
        }
        updateTime = parser.get<int>("updateTime");
        if (updateTime <= 0)
        {
            cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl;
            return false;
        }
        waitTime = parser.get<int>("waitTime");
        if (waitTime < 0)
        {
            cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl;
            return false;
        }
        return true;
    }

private :
    string inputType;
    string draw;
    string graph;
    string audio;
    int audioStream;

    string windowType;
    int windLen;
    int overlap;

    bool enableGrid;

    int rows;
    int cols;

    int xmarkup;
    int ymarkup;
    int zmarkup;

    int microTime;
    int frameSizeTime;
    int updateTime;
    int waitTime;

};

int main(int argc, char** argv)
{
    const String keys =
        "{help h usage ? |               | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}"
        "{inputType i    | file          | file or microphone                       }"
        "{draw d         | static        | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}"
        "{graph g        | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}"
        "{audio a        | Megamind.avi  | name and path to file                    }"
        "{audioStream s  | 1             | CAP_PROP_AUDIO_STREAM value. Select audio stream number }"
        "{windowType t   | Rect          | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }"
        "{windLen l      | 256           | size of window for STFT                  }"
        "{overlap o      | 128           | overlap of windows for STFT              }"

        "{enableGrid     | false         | grid on the amplitude graph              }"

        "{rows r         | 400           | rows of output image                     }"
        "{cols c         | 900           | cols of output image                     }"

        "{xmarkup x      | 5             | number of x axis divisions (time asix)   }"
        "{ymarkup y      | 5             | number of y axis divisions (frequency or/and amplitude axis) }"
        "{zmarkup z      | 5             | number of z axis divisions (colorbar)    }"

        "{microTime m    | 20            | time of recording audio with microphone in seconds }"
        "{frameSizeTime f| 5             | size of sliding window in seconds        }"
        "{updateTime u   | 1             | update time of sliding window in seconds }"
        "{waitTime w     | 10            | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }"
        ;

    CommandLineParser parser(argc, argv, keys);
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    AudioDrawing draw(parser);
    return 0;
}