mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
add OpenCV audio reading
This commit is contained in:
parent
73318fd514
commit
f09a577ab5
@ -2,7 +2,6 @@ import numpy as np
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import os
|
||||
import soundfile as sf # Temporary import to load audio files
|
||||
|
||||
'''
|
||||
You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
|
||||
@ -399,11 +398,6 @@ def predict(features, net, decoder):
|
||||
decoder : Decoder object
|
||||
return : Predicted text
|
||||
'''
|
||||
# This is a workaround https://github.com/opencv/opencv/issues/19091
|
||||
# expanding 1 dimension allows us to pass it to the network
|
||||
# from python. This should be resolved in the future.
|
||||
features = np.expand_dims(features,axis=3)
|
||||
|
||||
# make prediction
|
||||
net.setInput(features)
|
||||
output = net.forward()
|
||||
@ -412,6 +406,63 @@ def predict(features, net, decoder):
|
||||
prediction = decoder.decode(output.squeeze(0))
|
||||
return prediction[0]
|
||||
|
||||
def readAudioFile(file, audioStream):
    """Read audio samples from a file using OpenCV's VideoCapture audio API.

    Parameters
    ----------
    file : str
        Path to the audio file (or media file containing an audio stream).
    audioStream : int
        Index of the audio stream to read (CAP_PROP_AUDIO_STREAM value).

    Returns
    -------
    tuple(numpy.ndarray, int) or None
        A 1-D float64 array of samples and the sampling rate (Hz),
        or None when the file cannot be opened.
    """
    samplingRate = 16000
    # Request audio-only capture: disable video stream, decode samples as
    # 32-bit floats, and resample to the model's expected 16 kHz rate.
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, audioStream,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate])
    # Open once with the audio parameters (the original opened the file
    # twice: in the constructor and again via open()).
    cap = cv.VideoCapture()
    cap.open(file, cv.CAP_ANY, params)
    if not cap.isOpened():
        print("Error : Can't read audio file:", file, "with audioStream = ", audioStream)
        return None
    # Index of the first audio channel among retrievable frames.
    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    inputAudio = []
    # grab() returns False at end of stream.
    while cap.grab():
        # retrieve() returns (retval, data); data[0] is the sample buffer
        # of the first channel for this chunk.
        _, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
        inputAudio.extend(frame[0])
    inputAudio = np.asarray(inputAudio, dtype=np.float64)
    return inputAudio, samplingRate
|
||||
|
||||
def readAudioMicrophone(microTime):
    """Record audio from the default microphone for a fixed duration.

    Parameters
    ----------
    microTime : int
        Recording duration in seconds.

    Returns
    -------
    tuple(numpy.ndarray, int) or None
        A 1-D float64 array of samples and the sampling rate (Hz),
        or None when the microphone cannot be opened.
    """
    samplingRate = 16000
    # Audio-only capture from device 0: no video stream, 32-bit float
    # samples, resampled to 16 kHz.
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate])
    cap = cv.VideoCapture()
    cap.open(0, cv.CAP_ANY, params)
    if not cap.isOpened():
        print("Error: Can't open microphone")
        print("Error: problems with audio reading, check input arguments")
        return None
    # Index of the first audio channel among retrievable frames.
    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    # Measure elapsed wall time with OpenCV's tick counter so recording
    # stops after microTime seconds.
    cvTickFreq = cv.getTickFrequency()
    sysTimePrev = cv.getTickCount()
    sysTimeCurr = sysTimePrev
    inputAudio = []
    while (sysTimeCurr - sysTimePrev) / cvTickFreq < microTime:
        if not cap.grab():
            print("Error: Grab error")
            break
        # retrieve() returns (retval, data); data[0] is the sample buffer
        # of the first channel for this chunk.
        _, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
        inputAudio.extend(frame[0])
        sysTimeCurr = cv.getTickCount()
    inputAudio = np.asarray(inputAudio, dtype=np.float64)
    print("Number of samples: ", len(inputAudio))
    return inputAudio, samplingRate
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Computation backends supported by layers
|
||||
@ -421,7 +472,10 @@ if __name__ == '__main__':
|
||||
|
||||
parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_audio', type=str, required=True, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
|
||||
parser.add_argument('--input_type', type=str, required=True, help='file or microphone')
|
||||
parser.add_argument('--micro_time', type=int, default=15, help='Duration of microphone work in seconds. Must be more than 6 sec')
|
||||
parser.add_argument('--input_audio', type=str, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
|
||||
parser.add_argument('--audio_stream', type=int, default=0, help='CAP_PROP_AUDIO_STREAM value')
|
||||
parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
|
||||
parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
|
||||
parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.')
|
||||
@ -442,6 +496,9 @@ if __name__ == '__main__':
|
||||
raise OSError("Input audio file does not exist")
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Jasper model file does not exist")
|
||||
|
||||
features = []
|
||||
if args.input_type == "file":
|
||||
if args.input_audio.endswith('.txt'):
|
||||
with open(args.input_audio) as f:
|
||||
content = f.readlines()
|
||||
@ -455,15 +512,19 @@ if __name__ == '__main__':
|
||||
audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
|
||||
|
||||
# Read audio Files
|
||||
features = []
|
||||
try:
|
||||
for audio_file_path in audio_file_paths:
|
||||
audio = sf.read(audio_file_path)
|
||||
# If audio is stereo, just take one channel.
|
||||
X = audio[0] if audio[0].ndim==1 else audio[0][:,0]
|
||||
features.append(X)
|
||||
except:
|
||||
raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format")
|
||||
audio = readAudioFile(audio_file_path, args.audio_stream)
|
||||
if audio is None:
|
||||
raise Exception(f"Can't read {args.input_audio}. Try a different format")
|
||||
features.append(audio[0])
|
||||
elif args.input_type == "microphone":
|
||||
# Read audio from microphone
|
||||
audio = readAudioMicrophone(args.micro_time)
|
||||
if audio is None:
|
||||
raise Exception(f"Can't open microphone. Try a different format")
|
||||
features.append(audio[0])
|
||||
else:
|
||||
raise Exception(f"input_type {args.input_type} doesn't exist. Please enter 'file' or 'microphone'")
|
||||
|
||||
# Get Filterbank Features
|
||||
feature_extractor = FilterbankFeatures()
|
||||
|
Loading…
Reference in New Issue
Block a user