diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py
index c3ae702d9..af0d01121 100644
--- a/backend/routers/transcribe.py
+++ b/backend/routers/transcribe.py
@@ -13,7 +13,7 @@
 from utils.redis_utils import get_user_speech_profile, get_user_speech_profile_duration
 from utils.stt.deepgram_util import process_audio_dg, send_initial_file2, transcribe_file_deepgram
-from utils.stt.vad import VADIterator, model, get_speech_state, SpeechState, vad_is_empty, is_speech_present
+from utils.stt.vad import VADIterator, model, vad_is_empty, is_speech_present
 
 
 router = APIRouter()
 
@@ -54,7 +54,7 @@ async def _websocket_util(
     websocket_active = True
     duration = 0
     is_speech_active = False
-    speech_timeout = 0.7  # Good for now but better dynamically adjust it by user behaviour
+    speech_timeout = 0.7  # Good for now, but better to adjust dynamically from user behaviour; one idea: increase as active time passes, up to a threshold. Not needed yet.
     last_speech_time = 0
     try:
         if language == 'en' and codec == 'opus' and include_speech_profile:
@@ -89,26 +89,20 @@ async def receive_audio(socket1, socket2):
         databuffer = bytearray(b"")
         REALTIME_RESOLUTION = 0.01
-        sample_width = 2  # pcm here is 16 bit
+        sample_width = 2  # both pcm8 and pcm16 are 16-bit here
         byte_rate = sample_width * sample_rate * channels
         chunk_size = int(byte_rate * REALTIME_RESOLUTION)
         timer_start = time.time()
-        speech_state = SpeechState.no_speech
-        voice_found, not_voice = 0, 0
-        # path = 'scripts/vad/audio_bytes.txt'
-        # if os.path.exists(path):
-        #     os.remove(path)
-        # audio_file = open(path, "a")
         try:
-            sample_width = 1 if codec == "pcm8" else 2
             while websocket_active:
                 data = await websocket.receive_bytes()
                 if codec == 'opus':
                     decoded_opus = decoder.decode(data, frame_size=320)
                     samples = torch.frombuffer(decoded_opus, dtype=torch.int16).float() / 32768.0
-                elif codec in ['pcm8', 'pcm16']:  # Both are now 16-bit
-                    samples = torch.frombuffer(data, dtype=torch.int16).float() / 32768.0
+                elif codec in ['pcm8', 'pcm16']:  # Both are 16-bit
+                    writable_data = bytearray(data)  # torch.frombuffer warns on read-only buffers such as the bytes from receive_bytes()
+                    samples = torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0
                 else:
                     raise ValueError(f"Unsupported codec: {codec}")
@@ -144,8 +138,6 @@ async def receive_audio(socket1, socket2):
                 else:
                     socket2.send(audio_buffer)
-                audio_buffer = bytearray()
-
         except WebSocketDisconnect:
             print("WebSocket disconnected")
         except Exception as e:
diff --git a/backend/utils/stt/vad.py b/backend/utils/stt/vad.py
index 04512d32d..8fdf9e56f 100644
--- a/backend/utils/stt/vad.py
+++ b/backend/utils/stt/vad.py
@@ -19,8 +19,7 @@
 (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
 
 
-def get_speech_state(data, vad_iterator, window_size_samples=256):
-    has_start, has_end = False, False
+def is_speech_present(data, vad_iterator, window_size_samples=256):
     for i in range(0, len(data), window_size_samples):
         chunk = data[i: i + window_size_samples]
         if len(chunk) < window_size_samples:
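
Note (not part of the patch): a minimal self-contained sketch of the two behavioural pieces above. The bytearray(data) copy exists because websocket.receive_bytes() returns immutable bytes and torch.frombuffer warns that PyTorch does not support read-only buffers. The vad.py hunk is truncated at the section boundary, so the body of is_speech_present below is a hypothetical completion based on Silero's public VADIterator API; loading the model via torch.hub, the silence frame, and the 512-sample window are assumptions, not code from this PR.

    import torch

    # Load Silero VAD via torch.hub, matching the `utils` unpacking in vad.py.
    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
    vad_iterator = VADIterator(model)


    def is_speech_present(data, vad_iterator, window_size_samples=256):
        # Hypothetical completion of the truncated hunk: feed fixed-size windows
        # to VADIterator; any speech event ({'start': ...} or {'end': ...}) counts.
        for i in range(0, len(data), window_size_samples):
            chunk = data[i: i + window_size_samples]
            if len(chunk) < window_size_samples:
                break
            if vad_iterator(chunk):  # returns a dict on a speech event, else None
                vad_iterator.reset_states()
                return True
        return False


    # Stand-in for one pcm16 frame from websocket.receive_bytes(): 512 samples of silence.
    data = bytes(512 * 2)
    writable_data = bytearray(data)  # copy; torch.frombuffer warns on read-only buffers
    samples = torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0

    # Recent Silero releases expect 512-sample windows at 16 kHz (256 at 8 kHz),
    # so the window size is passed explicitly here.
    print(is_speech_present(samples, vad_iterator, window_size_samples=512))  # -> False

With VADIterator(model, sampling_rate=8000), 256-sample windows (the vad.py default) would be the matching size instead.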