mirror of
https://github.com/opencv/opencv.git
synced 2025-01-19 15:04:01 +08:00
d8b1fc45aa
AudioIO: add spectrogram samples for C++/python
1071 lines
40 KiB
C++
1071 lines
40 KiB
C++
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
|
|
using namespace cv;
|
|
using namespace std;
|
|
|
|
|
|
class AudioDrawing
|
|
{
|
|
|
|
public:
|
|
|
|
AudioDrawing(const CommandLineParser& parser) {
|
|
if (!initAndCheckArgs(parser))
|
|
{
|
|
cerr << "Error: Wrong input arguments" << endl;
|
|
exit(0);
|
|
}
|
|
Draw();
|
|
}
|
|
|
|
void Draw() {
|
|
if (draw == "static")
|
|
{
|
|
vector<int>inputAudio = {};
|
|
int samplingRate = 0;
|
|
if (inputType == "file")
|
|
{
|
|
samplingRate = readAudioFile(audio, inputAudio);
|
|
}
|
|
else if (inputType == "microphone")
|
|
{
|
|
samplingRate = readAudioMicrophone(inputAudio);
|
|
}
|
|
if ((inputAudio.size() == 0) || samplingRate <= 0)
|
|
{
|
|
cerr << "Error: problems with audio reading, check input arguments" << endl;
|
|
return;
|
|
}
|
|
|
|
int duration = static_cast<int>(inputAudio.size()) / samplingRate;
|
|
|
|
// since the dimensional grid is counted in integer seconds,
|
|
// if the input audio has an incomplete last second,
|
|
// then it is filled with zeros to complete
|
|
int remainder = static_cast<int>(inputAudio.size()) % samplingRate;
|
|
if (remainder)
|
|
{
|
|
int sizeToFullSec = samplingRate - remainder;
|
|
for (int j = 0; j < sizeToFullSec; ++j)
|
|
{
|
|
inputAudio.push_back(0);
|
|
}
|
|
duration += 1;
|
|
cout << "Update duration of audio to full last second with " <<
|
|
sizeToFullSec << " zero samples" << endl;
|
|
cout << "New number of samples " << inputAudio.size() << endl;
|
|
}
|
|
cout << "Duration of audio = " << duration << " seconds" << endl;
|
|
|
|
// since the dimensional grid is counted in integer seconds,
|
|
// if duration of file is less than xmarkup, to avoid an incorrect display,
|
|
// xmarkup will be taken equal to duration
|
|
if (duration <= xmarkup)
|
|
{
|
|
xmarkup = duration + 1;
|
|
}
|
|
|
|
if (graph == "ampl")
|
|
{
|
|
Mat imgAmplitude = drawAmplitude(inputAudio);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
|
|
imshow("Display amplitude graph", imgAmplitude);
|
|
waitKey(0);
|
|
}
|
|
else if (graph == "spec")
|
|
{
|
|
vector<vector<double>>stft = STFT(inputAudio);
|
|
Mat imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
|
|
imshow("Display spectrogram", imgSpec);
|
|
waitKey(0);
|
|
}
|
|
else if (graph == "ampl_and_spec")
|
|
{
|
|
Mat imgAmplitude = drawAmplitude(inputAudio);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
|
|
vector<vector<double>>stft = STFT(inputAudio);
|
|
Mat imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
|
|
Mat imgTotal = concatenateImages(imgAmplitude, imgSpec);
|
|
imshow("Display amplitude graph and spectrogram", imgTotal);
|
|
waitKey(0);
|
|
}
|
|
}
|
|
else if (draw == "dynamic")
|
|
{
|
|
if (inputType == "file")
|
|
{
|
|
dynamicFile(audio);
|
|
}
|
|
else if (inputType == "microphone")
|
|
{
|
|
dynamicMicrophone();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Nothing to release explicitly: every member cleans up after itself.
~AudioDrawing() = default;
|
|
|
|
int readAudioFile(string file, vector<int>& inputAudio)
|
|
{
|
|
VideoCapture cap;
|
|
vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
|
|
CAP_PROP_VIDEO_STREAM, -1,
|
|
CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
|
|
|
|
cap.open(file, CAP_ANY, params);
|
|
if (!cap.isOpened())
|
|
{
|
|
cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
|
|
return -1;
|
|
}
|
|
const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
|
|
const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
|
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
|
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
|
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
|
|
|
|
vector<int> frameVec;
|
|
Mat frame;
|
|
for (;;)
|
|
{
|
|
if (cap.grab())
|
|
{
|
|
cap.retrieve(frame, audioBaseIndex);
|
|
frameVec = frame;
|
|
inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end());
|
|
}
|
|
else
|
|
{
|
|
cout << "Number of samples: " << inputAudio.size() << endl;
|
|
break;
|
|
}
|
|
}
|
|
return samplingRate;
|
|
}
|
|
|
|
// Records from capture device 0 for `microTime` seconds (measured with the
// OpenCV tick counter) and appends the samples to `inputAudio`.
// Returns the sampling rate reported by the capture, or -1 when the device
// cannot be opened. A failed grab stops the recording early.
int readAudioMicrophone(vector<int>& inputAudio)
{
    const vector<int> openParams { CAP_PROP_AUDIO_STREAM, 0,
                                   CAP_PROP_VIDEO_STREAM, -1 };
    VideoCapture mic;
    mic.open(0, CAP_ANY, openParams);
    if (!mic.isOpened())
    {
        cerr << "Error: Can't open microphone" << endl;
        return -1;
    }

    const int baseIndex = static_cast<int>(mic.get(CAP_PROP_AUDIO_BASE_INDEX));
    const int channelCount = static_cast<int>(mic.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
    cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast<int>(mic.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
    const int rate = static_cast<int>(mic.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
    cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << rate << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << channelCount << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << mic.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;

    const double ticksPerSecond = getTickFrequency();
    const int64 startTick = getTickCount();
    int64 lastTick = startTick;

    Mat chunk;
    vector<int> chunkSamples;
    for (;;)
    {
        // the clock is refreshed only after a successful grab
        const double elapsedSeconds = (lastTick - startTick) / ticksPerSecond;
        if (!(elapsedSeconds < microTime))
        {
            break;
        }
        if (!mic.grab())
        {
            cerr << "Error: Grab error" << endl;
            break;
        }
        mic.retrieve(chunk, baseIndex);
        chunkSamples = chunk;
        inputAudio.insert(inputAudio.end(), chunkSamples.begin(), chunkSamples.end());
        lastTick = getTickCount();
    }
    cout << "Number of samples: " << inputAudio.size() << endl;
    return rate;
}
|
|
|
|
|
|
// Plots the waveform of `inputAudio` on a white canvas and returns it scaled
// to 900x400 pixels.
//
// The samples are resampled to at most 40000 columns, normalized by their
// largest absolute value and drawn around the horizontal middle line.
Mat drawAmplitude(const vector<int>& inputAudio)
{
    const Scalar color(247, 111, 87);
    const int thickness = 5;
    const int frameVectorRows = 500;
    const int middle = frameVectorRows / 2;
    // usually the input data is too big, so it is necessary
    // to reduce size using interpolation of data
    int frameVectorCols = 40000;
    if (static_cast<int>(inputAudio.size()) < frameVectorCols)
    {
        frameVectorCols = static_cast<int>(inputAudio.size());
    }

    Mat img(frameVectorRows, frameVectorCols, CV_8UC3, Scalar(255, 255, 255)); // white background

    vector<double> reshapeAudio(inputAudio.begin(), inputAudio.end());

    // view the samples as a 1 x N row (no copy) and resample to the plot width
    Mat img_frameVector(1, static_cast<int>(reshapeAudio.size()), CV_64F, reshapeAudio.data());
    Mat img_frameVector_resize;
    // resize() takes (src, dst, dsize, fx, fy, interpolation): the flag must be
    // passed in the sixth position, otherwise it is read as the fx scale factor
    resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), 0, 0, INTER_LINEAR);
    reshapeAudio = img_frameVector_resize;

    // normalization data by maximum element
    normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF);

    // map [-1, 1] to pixel rows: +1 -> row 0, -1 -> row 2 * middle
    for (double& value : reshapeAudio)
    {
        value = middle - value * middle;
    }

    for (int i = 1; i < static_cast<int>(reshapeAudio.size()); ++i)
    {
        line(img, Point(i - 1, static_cast<int>(reshapeAudio[i - 1])),
             Point(i, static_cast<int>(reshapeAudio[i])), color, thickness);
    }
    Mat resImage;
    resize(img, resImage, Size(900, 400), 0, 0, INTER_AREA);
    return resImage;
}
|
|
|
|
// Adds axes, tick marks, labels and (optionally) a grid around an amplitude
// plot and returns the result resized to cols x rows.
//
// inputImg     - plot produced by drawAmplitude()
// inputAudio   - samples shown in the plot; their min/max label the y axis
// samplingRate - samples per second, used to derive xmax when xmax == 0
// xmin, xmax   - time range in seconds; xmax == 0 means "whole input"
//
// x axis shows time, y axis shows amplitude.
Mat drawAmplitudeScale(Mat& inputImg, const vector<int>& inputAudio, int samplingRate,
                       int xmin = 0, int xmax = 0)
{
    // margins around the plot
    const int preCol = 100;
    const int aftCol = 100;
    const int preLine = 40;
    const int aftLine = 50;

    const int frameVectorRows = inputImg.rows;
    const int frameVectorCols = inputImg.cols;

    const int totalRows = preLine + frameVectorRows + aftLine;
    const int totalCols = preCol + frameVectorCols + aftCol;

    Mat imgTotal(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255));
    inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows)));

    // number of intervals between marks; kept >= 1 so that a single mark
    // cannot cause a division by zero
    const int xIntervals = std::max(xmarkup - 1, 1);
    const int yIntervals = std::max(ymarkup - 1, 1);

    // calculating values on x axis
    if (xmax == 0)
    {
        xmax = static_cast<int>(inputAudio.size()) / samplingRate;
    }
    std::vector<double> xList(xmarkup, 0.);
    if (xmax >= xmarkup)
    {
        // spread the labels evenly in floating point so that the last mark is
        // labelled exactly xmax (integer division would fall short of it)
        for (int i = 0; i < xmarkup; ++i)
        {
            xList[i] = xmin + static_cast<double>(xmax - xmin) * i / xIntervals;
        }
    }
    else
    {
        // this case is used to display a dynamic update:
        // seconds xmin+1 .. xmax are right-aligned, earlier marks stay at 0
        const int known = std::max(xmax - xmin, 0);
        for (int k = 0; k < known; ++k)
        {
            const int slot = xmarkup - known + k;
            if (slot >= 0)
            {
                xList[slot] = xmin + k + 1;
            }
        }
    }

    // calculating values on y axis
    double minCv = 0.;
    double maxCv = 0.;
    minMaxLoc(inputAudio, &minCv, &maxCv);
    const int ymin = static_cast<int>(minCv);
    const int ymax = static_cast<int>(maxCv);

    std::vector<double> yList(ymarkup, 0.);
    for (int i = 0; i < ymarkup; ++i)
    {
        yList[i] = ymin + (static_cast<double>(ymax) - ymin) * i / yIntervals;
    }

    // parameters for layout drawing
    const int textThickness = 1;
    const int gridThickness = 1;
    const Scalar gridColor(0, 0, 0);
    const Scalar textColor(0, 0, 0);
    const float fontScale = 0.5;

    // horizontal axis
    line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
         gridColor, gridThickness);
    // vertical axis
    line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
         gridColor, gridThickness);

    // parameters for layout calculation
    const int serifSize = 10;
    const int indentDownX = serifSize * 2;
    const int indentDownY = serifSize / 2;
    const int indentLeftX = serifSize;
    const int indentLeftY = 2 * preCol / 3;

    // drawing layout for x axis
    const int numX = frameVectorCols / xIntervals;
    for (size_t i = 0; i < xList.size(); ++i)
    {
        const int a1 = static_cast<int>(preCol + i * numX);
        const int a2 = frameVectorRows + preLine;

        const int b1 = a1;
        const int b2 = a2 + serifSize;

        if (enableGrid)
        {
            line(imgTotal, Point(a1, a2), Point(a1, preLine), gridColor, gridThickness);
        }
        line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
        putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
                FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
    }

    // drawing layout for y axis
    const int numY = frameVectorRows / yIntervals;
    for (size_t i = 0; i < yList.size(); ++i)
    {
        const int a1 = preCol;
        const int a2 = static_cast<int>(totalRows - aftLine - i * numY);
        const int b1 = preCol - serifSize;
        const int b2 = a2;
        if (enableGrid)
        {
            line(imgTotal, Point(a1, a2), Point(preCol + frameVectorCols, a2), gridColor, gridThickness);
        }
        line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
        putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
                FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
    }
    Mat resImage;
    // interpolation is the sixth argument of resize(); fx/fy stay 0 because dsize is given
    resize(imgTotal, resImage, Size(cols, rows), 0, 0, INTER_AREA);
    return resImage;
}
|
|
|
|
// Short-time Fourier transform of `inputAudio`.
//
// The signal is cut into frames of `windLen` samples that start every
// `windLen - overlap` samples; frames reaching past the end of the signal are
// completed with zeros. Each frame is optionally weighted with a Hann or
// Hamming window, transformed with dft(), and the magnitude of the first
// windLen / 4 bins is converted with 10 * log10 (zero magnitudes stay 0).
//
// Returns stft[bin][frame]; the result is empty for empty input.
// https://en.wikipedia.org/wiki/Short-time_Fourier_transform
//
// NOTE(review): only windLen / 4 bins are kept although the positive half of
// the spectrum has windLen / 2 bins - confirm this is intended.
vector<vector<double>> STFT(const vector<int>& inputAudio)
{
    const int timeStep = windLen - overlap;
    // a non-positive hop would never advance through the signal
    CV_Assert(windLen > 0 && timeStep > 0);

    // window coefficients; left empty for the rectangular window
    vector<double> window;
    if (windowType == "Hann" || windowType == "Hamming")
    {
        // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
        // w[n] = a0 - (1 - a0) * cos(2 * pi * n / (N - 1)), n = 0 .. N - 1
        const double a0 = (windowType == "Hann") ? 0.5 : 0.53836;
        window.assign(static_cast<size_t>(windLen), 1.0); // N == 1: single coefficient 1
        if (windLen > 1)
        {
            for (int n = 0; n < windLen; ++n)
            {
                window[n] = a0 - (1.0 - a0) * cos(2.0 * CV_PI * n / (windLen - 1));
            }
        }
    }

    const size_t total = inputAudio.size();
    const size_t frameLen = static_cast<size_t>(windLen);
    const size_t hop = static_cast<size_t>(timeStep);

    Mat dstMat;
    vector<double> stftRow; // magnitudes, frame after frame
    vector<double> section(frameLen, 0.);
    size_t frameCount = 0;
    for (size_t start = 0; start < total; start += hop)
    {
        // copy the available samples and zero-fill the rest of the frame
        const size_t available = std::min(frameLen, total - start);
        for (size_t j = 0; j < frameLen; ++j)
        {
            section[j] = (j < available) ? static_cast<double>(inputAudio[start + j]) : 0.;
        }
        if (!window.empty())
        {
            for (size_t j = 0; j < available; ++j)
            {
                section[j] *= window[j];
            }
        }

        dft(section, dstMat, DFT_COMPLEX_OUTPUT);

        // the output holds interleaved (re, im) pairs
        const int bins = static_cast<int>(dstMat.total()) / 4;
        const double* spectrum = dstMat.ptr<double>();
        for (int j = 0; j < bins; ++j)
        {
            const double re = spectrum[2 * j];
            const double im = spectrum[2 * j + 1];
            stftRow.push_back(sqrt(re * re + im * im));
        }
        ++frameCount;
    }

    // dimensions come from what was actually computed, so the transposition
    // below can never read past the end of stftRow
    const size_t xSize = frameCount;
    const size_t ySize = (frameCount != 0) ? stftRow.size() / frameCount : 0;

    vector<vector<double>> stft(ySize, vector<double>(xSize, 0.));
    for (size_t i = 0; i < xSize; ++i)
    {
        for (size_t j = 0; j < ySize; ++j)
        {
            // write elements with transposition and convert it to the decibel scale
            const double stftElem = stftRow[i * ySize + j];
            if (stftElem != 0.)
            {
                stft[j][i] = 10 * log10(stftElem);
            }
        }
    }
    return stft;
}
|
|
|
|
// Turns stft[bin][frame] into a colour image (COLORMAP_INFERNO) of
// 900x400 pixels, with bin 0 on the bottom row.
//
// Values are scaled by the largest absolute value so that it maps to 255;
// negative values are drawn as 0. Throws (CV_Assert) on an empty `stft`.
Mat drawSpectrogram(const vector<vector<double>>& stft)
{
    CV_Assert(!stft.empty() && !stft[0].empty());

    const int frameVectorRows = static_cast<int>(stft.size());
    const int frameVectorCols = static_cast<int>(stft[0].size());

    // Normalization of image values from 0 to 255 to get more contrast image
    // and this normalization will be taken into account in the scale drawing
    const int colormapImageRows = 255;

    double maxStft = 0.;
    for (const vector<double>& row : stft)
    {
        double minCv = 0.;
        double maxCv = 0.;
        minMaxLoc(row, &minCv, &maxCv);
        maxStft = std::max(maxStft, std::max(std::abs(maxCv), std::abs(minCv)));
    }
    // if maxStft is zero (silence)
    if (maxStft == 0.)
    {
        maxStft = 1;
    }
    Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255));

    for (int i = 0; i < frameVectorRows; ++i)
    {
        for (int j = 0; j < frameVectorCols; ++j)
        {
            const double scaled = stft[i][j] * colormapImageRows / maxStft;
            // converting a negative double to uchar is undefined, so clamp at 0
            imgSpec.at<uchar>(frameVectorRows - i - 1, j) = static_cast<uchar>(std::max(scaled, 0.));
        }
    }
    applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO);
    Mat resImage;
    // interpolation is the sixth argument of resize(); fx/fy stay 0 because dsize is given
    resize(imgSpec, resImage, Size(900, 400), 0, 0, INTER_AREA);
    return resImage;
}
|
|
|
|
// Adds axes, labels and a colour bar around a spectrogram image and returns
// the result resized to cols x rows.
//
// inputImg     - image produced by drawSpectrogram()
// inputAudio   - samples the spectrogram was computed from (for the duration)
// samplingRate - samples per second; half of it labels the top of the y axis
// stft         - spectrogram values; their min/max label the colour bar
// xmin, xmax   - time range in seconds; xmax == 0 means "whole input"
//
// x axis for time, y axis for frequencies,
// z axis for magnitudes of frequencies shown by color scale.
Mat drawSpectrogramColorbar(Mat& inputImg, const vector<int>& inputAudio,
                            int samplingRate, const vector<vector<double>>& stft,
                            int xmin = 0, int xmax = 0)
{
    CV_Assert(!stft.empty());

    // margins and colour bar geometry
    const int preCol = 100;
    const int aftCol = 100;
    const int preLine = 40;
    const int aftLine = 50;
    const int colColor = 20;
    const int indCol = 20;

    const int frameVectorRows = inputImg.rows;
    const int frameVectorCols = inputImg.cols;

    const int totalRows = preLine + frameVectorRows + aftLine;
    const int totalCols = preCol + frameVectorCols + aftCol;

    Mat imgTotal(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255));
    inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows)));

    // colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
    // so here colorbar has values from 255 to 0
    const int colorArrSize = 256;
    Mat imgColorBar(colorArrSize, colColor, CV_8UC1, Scalar(255, 255, 255));
    for (int i = 0; i < colorArrSize; ++i)
    {
        for (int j = 0; j < colColor; ++j)
        {
            imgColorBar.at<uchar>(i, j) = static_cast<uchar>(colorArrSize - 1 - i); // from 255 to 0
        }
    }

    applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO);
    // interpolation is the sixth argument of resize(); fx/fy stay 0 because dsize is given
    resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), 0, 0, INTER_AREA);
    imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows)));

    // number of intervals between marks; kept >= 1 so that a single mark
    // cannot cause a division by zero
    const int xIntervals = std::max(xmarkup - 1, 1);
    const int yIntervals = std::max(ymarkup - 1, 1);
    const int zIntervals = std::max(zmarkup - 1, 1);

    // calculating values on x axis
    if (xmax == 0)
    {
        xmax = static_cast<int>(inputAudio.size()) / samplingRate + 1;
    }
    vector<double> xList(xmarkup, 0.);
    if (xmax >= xmarkup)
    {
        // floating-point spacing so that the last mark is labelled exactly xmax
        for (int i = 0; i < xmarkup; ++i)
        {
            xList[i] = xmin + static_cast<double>(xmax - xmin) * i / xIntervals;
        }
    }
    else
    {
        // this case is used to display a dynamic update:
        // seconds xmin+1 .. xmax are right-aligned, earlier marks stay at 0
        const int known = std::max(xmax - xmin, 0);
        for (int k = 0; k < known; ++k)
        {
            const int slot = xmarkup - known + k;
            if (slot >= 0)
            {
                xList[slot] = xmin + k + 1;
            }
        }
    }

    // calculating values on y axis
    // according to the Nyquist sampling theorem,
    // signal should posses frequencies equal to half of sampling rate
    const int ymin = 0;
    const int ymax = samplingRate / 2;

    vector<double> yList;
    for (int i = 0; i < ymarkup; ++i)
    {
        yList.push_back(ymin + static_cast<double>(ymax - ymin) * i / yIntervals);
    }

    // calculating values on z axis: global min/max over all rows of stft
    double minCv = 0.;
    double maxCv = 0.;
    minMaxLoc(stft[0], &minCv, &maxCv);
    double zmin = minCv, zmax = maxCv;
    for (size_t i = 1; i < stft.size(); ++i)
    {
        minMaxLoc(stft[i], &minCv, &maxCv);
        zmax = std::max(zmax, maxCv);
        zmin = std::min(zmin, minCv);
    }

    std::vector<double> zList;
    const double deltaz = (zmax - zmin) / zIntervals;
    for (int i = 0; i < zmarkup; ++i)
    {
        zList.push_back(zmin + deltaz * i);
    }

    // parameters for layout drawing
    const int textThickness = 1;
    const int gridThickness = 1;
    const Scalar gridColor(0, 0, 0);
    const Scalar textColor(0, 0, 0);
    const float fontScale = 0.5;

    const int serifSize = 10;
    const int indentDownX = serifSize * 2;
    const int indentDownY = serifSize / 2;
    const int indentLeftX = serifSize;
    const int indentLeftY = 2 * preCol / 3;

    // horizontal axis
    line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine),
         gridColor, gridThickness);
    // vertical axis
    line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows),
         gridColor, gridThickness);

    // drawing layout for x axis
    const int numX = frameVectorCols / xIntervals;
    for (size_t i = 0; i < xList.size(); ++i)
    {
        const int a1 = static_cast<int>(preCol + i * numX);
        const int a2 = frameVectorRows + preLine;

        const int b1 = a1;
        const int b2 = a2 + serifSize;

        line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
        putText(imgTotal, to_string(static_cast<int>(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX),
                FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
    }

    // drawing layout for y axis
    const int numY = frameVectorRows / yIntervals;
    for (size_t i = 0; i < yList.size(); ++i)
    {
        const int a1 = preCol;
        const int a2 = static_cast<int>(totalRows - aftLine - i * numY);

        const int b1 = preCol - serifSize;
        const int b2 = a2;

        line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
        putText(imgTotal, to_string(static_cast<int>(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY),
                FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
    }

    // drawing layout for z axis
    const int numZ = frameVectorRows / zIntervals;
    for (size_t i = 0; i < zList.size(); ++i)
    {
        const int a1 = preCol + frameVectorCols + indCol + colColor;
        const int a2 = static_cast<int>(totalRows - aftLine - i * numZ);

        const int b1 = a1 + serifSize;
        const int b2 = a2;

        line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness);
        putText(imgTotal, to_string(static_cast<int>(zList[i])), Point(b1 + 10, b2 + indentDownY),
                FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness);
    }
    Mat resImage;
    resize(imgTotal, resImage, Size(cols, rows), 0, 0, INTER_AREA);
    return resImage;
}
|
|
|
|
// Stacks two images vertically on a white CV_8UC3 canvas: img1 occupies the
// top rows, img2 is placed directly below it. Both are left-aligned; when the
// widths differ, the uncovered area stays white.
Mat concatenateImages(Mat& img1, Mat& img2)
{
    const int canvasHeight = img1.rows + img2.rows;
    const int canvasWidth = max(img1.cols, img2.cols);
    Mat canvas(canvasHeight, canvasWidth, CV_8UC3, Scalar(255, 255, 255));

    const Rect upperSlot(0, 0, img1.cols, img1.rows);
    const Rect lowerSlot(0, img1.rows, img2.cols, img2.rows);

    img1.copyTo(canvas(upperSlot));
    img2.copyTo(canvas(lowerSlot));
    return canvas;
}
|
|
|
|
void dynamicFile(const string file)
|
|
{
|
|
VideoCapture cap;
|
|
vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream,
|
|
CAP_PROP_VIDEO_STREAM, -1,
|
|
CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
|
|
|
|
cap.open(file, CAP_ANY, params);
|
|
if (!cap.isOpened())
|
|
{
|
|
cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl;
|
|
return;
|
|
}
|
|
|
|
const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
|
|
const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
|
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
|
|
|
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
|
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
|
|
|
|
int step = static_cast<int>(updateTime * samplingRate);
|
|
int frameSize = static_cast<int>(frameSizeTime * samplingRate);
|
|
|
|
// since the dimensional grid is counted in integer seconds,
|
|
// if duration of audio frame is less than xmarkup, to avoid an incorrect display,
|
|
// xmarkup will be taken equal to duration
|
|
if (frameSizeTime <= xmarkup)
|
|
{
|
|
xmarkup = frameSizeTime;
|
|
}
|
|
|
|
vector<int> buffer;
|
|
vector<int> frameVector;
|
|
vector<int> section(frameSize, 0);
|
|
vector<vector<double>>stft;
|
|
Mat frame, imgAmplitude, imgSpec, imgTotal;
|
|
int currentSamples = 0;
|
|
int xmin = 0;
|
|
int xmax = 0;
|
|
|
|
for (;;)
|
|
{
|
|
if (cap.grab())
|
|
{
|
|
cap.retrieve(frame, audioBaseIndex);
|
|
frameVector = frame;
|
|
buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
|
|
int bufferSize = static_cast<int>(buffer.size());
|
|
if (bufferSize >= step)
|
|
{
|
|
currentSamples += bufferSize;
|
|
section.erase(section.begin(), section.begin() + step);
|
|
section.insert(section.end(), buffer.begin(), buffer.end());
|
|
buffer.erase(buffer.begin(), buffer.begin() + step);
|
|
if (currentSamples < frameSize)
|
|
{
|
|
xmin = 0;
|
|
xmax = (currentSamples) / samplingRate;
|
|
}
|
|
else
|
|
{
|
|
xmin = (currentSamples - frameSize) / samplingRate + 1;
|
|
xmax = (currentSamples) / samplingRate;
|
|
}
|
|
|
|
if (graph == "ampl")
|
|
{
|
|
imgAmplitude = drawAmplitude(section);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
|
|
imshow("Display amplitude graph", imgAmplitude);
|
|
waitKey(waitTime);
|
|
}
|
|
else if (graph == "spec")
|
|
{
|
|
stft = STFT(section);
|
|
imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
|
|
imshow("Display spectrogram", imgSpec);
|
|
waitKey(waitTime);
|
|
}
|
|
else if (graph == "ampl_and_spec")
|
|
{
|
|
imgAmplitude = drawAmplitude(section);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
|
|
stft = STFT(section);
|
|
imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
|
|
imgTotal = concatenateImages(imgAmplitude, imgSpec);
|
|
imshow("Display amplitude graph and spectrogram", imgTotal);
|
|
waitKey(waitTime);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
void dynamicMicrophone()
|
|
{
|
|
VideoCapture cap;
|
|
vector<int> params { CAP_PROP_AUDIO_STREAM, 0,
|
|
CAP_PROP_VIDEO_STREAM, -1 };
|
|
|
|
cap.open(0, CAP_MSMF, params);
|
|
if (!cap.isOpened())
|
|
{
|
|
cerr << "Error: Can't open microphone" << endl;
|
|
return;
|
|
}
|
|
|
|
const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX));
|
|
const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
|
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND));
|
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl;
|
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
|
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
|
|
|
|
const double cvTickFreq = getTickFrequency();
|
|
int64 sysTimeCurr = getTickCount();
|
|
int64 sysTimePrev = sysTimeCurr;
|
|
|
|
int step = (updateTime * samplingRate);
|
|
int frameSize = (frameSizeTime * samplingRate);
|
|
// since the dimensional grid is counted in integer seconds,
|
|
// if duration of audio frame is less than xmarkup, to avoid an incorrect display,
|
|
// xmarkup will be taken equal to duration
|
|
if (frameSizeTime <= xmarkup)
|
|
{
|
|
xmarkup = frameSizeTime;
|
|
}
|
|
|
|
vector<int> frameVector;
|
|
vector<int> buffer;
|
|
vector<int> section(frameSize, 0);
|
|
Mat frame, imgAmplitude, imgSpec, imgTotal;
|
|
|
|
int currentSamples = 0;
|
|
vector<vector<double>> stft;
|
|
int xmin = 0;
|
|
int xmax = 0;
|
|
waitTime = updateTime * 1000;
|
|
while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime)
|
|
{
|
|
if (cap.grab())
|
|
{
|
|
cap.retrieve(frame, audioBaseIndex);
|
|
frameVector = frame;
|
|
buffer.insert(buffer.end(), frameVector.begin(), frameVector.end());
|
|
sysTimeCurr = getTickCount();
|
|
|
|
int bufferSize = static_cast<int>(buffer.size());
|
|
if (bufferSize >= step)
|
|
{
|
|
currentSamples += step;
|
|
section.erase(section.begin(), section.begin() + step);
|
|
section.insert(section.end(), buffer.begin(), buffer.end());
|
|
buffer.erase(buffer.begin(), buffer.begin() + step);
|
|
|
|
if (currentSamples < frameSize)
|
|
{
|
|
xmin = 0;
|
|
xmax = (currentSamples) / samplingRate;
|
|
}
|
|
else
|
|
{
|
|
xmin = (currentSamples - frameSize) / samplingRate + 1;
|
|
xmax = (currentSamples) / samplingRate;
|
|
}
|
|
|
|
if (graph == "ampl")
|
|
{
|
|
imgAmplitude = drawAmplitude(section);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
|
|
imshow("Display amplitude graph", imgAmplitude);
|
|
waitKey(waitTime);
|
|
}
|
|
else if (graph == "spec")
|
|
{
|
|
stft = STFT(section);
|
|
imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
|
|
imshow("Display spectrogram", imgSpec);
|
|
waitKey(waitTime);
|
|
}
|
|
else if (graph == "ampl_and_spec")
|
|
{
|
|
imgAmplitude = drawAmplitude(section);
|
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax);
|
|
stft = STFT(section);
|
|
imgSpec = drawSpectrogram(stft);
|
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax);
|
|
imgTotal = concatenateImages(imgAmplitude, imgSpec);
|
|
imshow("Display amplitude graph and spectrogram", imgTotal);
|
|
waitKey(waitTime);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
cerr << "Error: Grab error" << endl;
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
bool initAndCheckArgs(const CommandLineParser& parser)
|
|
{
|
|
inputType = parser.get<string>("inputType");
|
|
if ((inputType != "file") && (inputType != "microphone"))
|
|
{
|
|
cout << "Error: " << inputType << " input method doesnt exist" << endl;
|
|
return false;
|
|
}
|
|
|
|
draw = parser.get<string>("draw");
|
|
if ((draw != "static") && (draw != "dynamic"))
|
|
{
|
|
cout << "Error: " << draw << " draw type doesnt exist" << endl;
|
|
return false;
|
|
}
|
|
|
|
graph = parser.get<string>("graph");
|
|
if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec"))
|
|
{
|
|
cout << "Error: " << graph << " type of graph doesnt exist" << endl;
|
|
return false;
|
|
}
|
|
|
|
audio = samples::findFile(parser.get<std::string>("audio"));
|
|
|
|
audioStream = parser.get<int>("audioStream");
|
|
if (audioStream < 0)
|
|
{
|
|
cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl;
|
|
return false;
|
|
}
|
|
windowType = parser.get<string>("windowType");
|
|
if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming"))
|
|
{
|
|
cout << "Error: " << windowType << " type of window doesnt exist" << endl;
|
|
return false;
|
|
}
|
|
|
|
windLen = parser.get<int>("windLen");
|
|
if (windLen <= 0)
|
|
{
|
|
cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
|
|
overlap = parser.get<int>("overlap");
|
|
if (overlap <= 0)
|
|
{
|
|
cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
|
|
enableGrid = parser.get<bool>("enableGrid");
|
|
|
|
rows = parser.get<int>("rows");
|
|
if (rows <= 0)
|
|
{
|
|
cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
cols = parser.get<int>("cols");
|
|
|
|
if (cols <= 0)
|
|
{
|
|
cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
xmarkup = parser.get<int>("xmarkup");
|
|
if (xmarkup < 2)
|
|
{
|
|
cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl;
|
|
return false;
|
|
}
|
|
ymarkup = parser.get<int>("ymarkup");
|
|
if (ymarkup < 2)
|
|
{
|
|
cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl;
|
|
return false;
|
|
}
|
|
zmarkup = parser.get<int>("zmarkup");
|
|
if (zmarkup < 2)
|
|
{
|
|
cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl;
|
|
return false;
|
|
}
|
|
microTime = parser.get<int>("microTime");
|
|
if (microTime <= 0)
|
|
{
|
|
cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
frameSizeTime = parser.get<int>("frameSizeTime");
|
|
if (frameSizeTime <= 0)
|
|
{
|
|
cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
updateTime = parser.get<int>("updateTime");
|
|
if (updateTime <= 0)
|
|
{
|
|
cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl;
|
|
return false;
|
|
}
|
|
waitTime = parser.get<int>("waitTime");
|
|
if (waitTime < 0)
|
|
{
|
|
cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl;
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private :
|
|
string inputType;
|
|
string draw;
|
|
string graph;
|
|
string audio;
|
|
int audioStream;
|
|
|
|
string windowType;
|
|
int windLen;
|
|
int overlap;
|
|
|
|
bool enableGrid;
|
|
|
|
int rows;
|
|
int cols;
|
|
|
|
int xmarkup;
|
|
int ymarkup;
|
|
int zmarkup;
|
|
|
|
int microTime;
|
|
int frameSizeTime;
|
|
int updateTime;
|
|
int waitTime;
|
|
|
|
};
|
|
|
|
int main(int argc, char** argv)
|
|
{
|
|
const String keys =
|
|
"{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}"
|
|
"{inputType i | file | file or microphone }"
|
|
"{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}"
|
|
"{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}"
|
|
"{audio a | Megamind.avi | name and path to file }"
|
|
"{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }"
|
|
"{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }"
|
|
"{windLen l | 256 | size of window for STFT }"
|
|
"{overlap o | 128 | overlap of windows for STFT }"
|
|
|
|
"{enableGrid | false | grid on the amplitude graph }"
|
|
|
|
"{rows r | 400 | rows of output image }"
|
|
"{cols c | 900 | cols of output image }"
|
|
|
|
"{xmarkup x | 5 | number of x axis divisions (time asix) }"
|
|
"{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }"
|
|
"{zmarkup z | 5 | number of z axis divisions (colorbar) }"
|
|
|
|
"{microTime m | 20 | time of recording audio with microphone in seconds }"
|
|
"{frameSizeTime f| 5 | size of sliding window in seconds }"
|
|
"{updateTime u | 1 | update time of sliding window in seconds }"
|
|
"{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }"
|
|
;
|
|
|
|
CommandLineParser parser(argc, argv, keys);
|
|
if (parser.has("help"))
|
|
{
|
|
parser.printMessage();
|
|
return 0;
|
|
}
|
|
|
|
AudioDrawing draw(parser);
|
|
return 0;
|
|
} |