helper/src/speech.py

43 lines
1.4 KiB
Python

from typing import Final
import requests
from scipy.io import wavfile
class Speech:
    """Speech-to-text client that uploads a WAV file to an HTTP ``/stt`` endpoint.

    The file is validated locally (16000 Hz, mono, int16 samples) before it is
    sent, so unsupported audio is rejected without a network round trip.
    """

    # Base URL of the recognition server; audio is POSTed to ``<server>/stt``.
    __server: Final[str] = 'http://vosk.athene.tech'

    @staticmethod
    def __check_wav(wav_file):
        """Verify that *wav_file* is 16000 Hz, single-channel, int16 audio.

        Args:
            wav_file: Path (or open file object) accepted by
                ``scipy.io.wavfile.read``.

        Raises:
            ValueError: If the sample rate, channel count or sample type
                differs from the required format.
        """
        sample_rate, sig = wavfile.read(wav_file)
        # wavfile.read yields a 1-D array for mono data and a
        # (samples, channels) array otherwise, so the channel count is the
        # second dimension -- not the number of dimensions.
        channels = 1 if sig.ndim == 1 else sig.shape[1]
        bits = sig.dtype.base.name
        if sample_rate != 16000:
            raise ValueError(f'Sample rate is not 16000: {sample_rate}')
        if channels != 1:
            raise ValueError(
                f'Number of Channels is not 1 (Not mono): {channels}')
        if bits != 'int16':
            raise ValueError(f'Bits per sample is not 16: {bits}')

    @staticmethod
    def __load_wav(wav_file):
        """Validate *wav_file* and return its raw bytes.

        Args:
            wav_file: Path to the WAV file.

        Returns:
            The complete file content as ``bytes``.

        Raises:
            ValueError: If the file fails the format check.
        """
        Speech.__check_wav(wav_file)
        with open(wav_file, 'rb') as file:
            return file.read()

    def __stt(self, wav_file):
        """Send *wav_file* to the server and return the recognition result.

        Args:
            wav_file: Path to the WAV file to recognize.

        Returns:
            The ``text`` field of the JSON reply when its ``code`` field is
            falsy, otherwise a ``'Server error: ...'`` string containing the
            whole reply.

        Raises:
            ValueError: If the file fails the format check.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        print(f'Connecting to \'{self.__server}\'...')
        response = requests.post(
            url=f'{self.__server}/stt',
            data=Speech.__load_wav(wav_file),
            headers={'Content-Type': 'audio/wav'},
        )
        # Check the status before decoding: an error reply may not carry a
        # JSON body, and a decode failure would hide the real HTTP error.
        response.raise_for_status()
        result = response.json()
        return result['text'] if not result['code'] else f'Server error: {result}'

    def run_recognition(self, wav_file: str) -> str:
        """Recognize speech in *wav_file* and return the transcribed text.

        Args:
            wav_file: Path to a 16000 Hz, mono, int16 WAV file.

        Returns:
            The recognized text, or a ``'Server error: ...'`` string when the
            server reports a non-zero result code.

        Raises:
            ValueError: If the file is not in the required format.
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        return self.__stt(wav_file)