From 819188fd5fb8bb6b26c27bfff249adcdd4df5873 Mon Sep 17 00:00:00 2001 From: Joan Cabezas Date: Fri, 20 Sep 2024 00:35:11 -0700 Subject: [PATCH] whisper x postprocessing extra logs --- backend/routers/transcribe.py | 3 ++- backend/utils/memories/postprocess_memory.py | 18 +++++++++++------- backend/utils/stt/vad.py | 2 ++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index ed594df50..681e8e898 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -133,10 +133,11 @@ async def _websocket_util( if stt_service == STTService.soniox and ( sample_rate != 16000 or codec != 'opus' or language not in soniox_valid_languages): stt_service = STTService.deepgram - if stt_service == STTService.speechmatics and (sample_rate != 16000 or codec != 'opus'): stt_service = STTService.deepgram + # At some point try running all the models together to easily compare + # Check: Why do we need try-catch around websocket.accept? 
try: await websocket.accept() diff --git a/backend/utils/memories/postprocess_memory.py b/backend/utils/memories/postprocess_memory.py index 4cb1670cd..be9762f8a 100644 --- a/backend/utils/memories/postprocess_memory.py +++ b/backend/utils/memories/postprocess_memory.py @@ -19,32 +19,36 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedback: bool, streaming_model: str): memory_data = _get_memory_by_id(uid, memory_id) if not memory_data: - return (404, "Memory not found") + return 404, "Memory not found" memory = Memory(**memory_data) if memory.discarded: print('postprocess_memory: Memory is discarded') - return (400, "Memory is discarded") + return 400, "Memory is discarded" if memory.postprocessing is not None and memory.postprocessing.status != PostProcessingStatus.not_started: print(f'postprocess_memory: Memory can\'t be post-processed again {memory.postprocessing.status}') - return (400, "Memory can't be post-processed again") + return 400, "Memory can't be post-processed again" aseg = AudioSegment.from_wav(file_path) if aseg.duration_seconds < 10: # TODO: validate duration more accurately, segment.last.end - segment.first.start - 10 # TODO: fix app, sometimes audio uploaded is wrong, is too short. print('postprocess_memory: Audio duration is too short, seems wrong.') memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled) - return (500, "Audio duration is too short, seems wrong.") + return 500, "Audio duration is too short, seems wrong." memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.in_progress) try: # Calling VAD to avoid processing empty parts and getting hallucinations from whisper. + # TODO: use these logs to determine if whisperx is failing because of the VAD results. 
+ print('previous to vad_is_empty (segments duration):', + memory.transcript_segments[-1].end - memory.transcript_segments[0].start) vad_segments = vad_is_empty(file_path, return_segments=True) if vad_segments: start = vad_segments[0]['start'] end = vad_segments[-1]['end'] + print('vad_is_empty file result segments:', start, end) aseg = AudioSegment.from_wav(file_path) aseg = aseg[max(0, (start - 1) * 1000):min((end + 1) * 1000, aseg.duration_seconds * 1000)] aseg.export(file_path, format="wav") @@ -90,7 +94,7 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb memory.postprocessing = MemoryPostProcessing( status=PostProcessingStatus.failed, model=PostProcessingModel.fal_whisperx) # TODO: consider doing process_memory, if any segment still matched to user or people - return (200, memory) + return 200, memory # Reprocess memory with improved transcription result: Memory = process_memory(uid, memory.language, memory, force_process=True) @@ -101,13 +105,13 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb except Exception as e: print(e) memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.failed, fail_reason=str(e)) - return (500, str(e)) + return 500, str(e) memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.completed) result.postprocessing = MemoryPostProcessing( status=PostProcessingStatus.completed, model=PostProcessingModel.fal_whisperx) - return (200, result) + return 200, result def _get_memory_by_id(uid: str, memory_id: str) -> dict: diff --git a/backend/utils/stt/vad.py b/backend/utils/stt/vad.py index a5ca374d8..9870cbacf 100644 --- a/backend/utils/stt/vad.py +++ b/backend/utils/stt/vad.py @@ -38,6 +38,8 @@ def is_audio_empty(file_path, sample_rate=8000): def vad_is_empty(file_path, return_segments: bool = False): """Uses vad_modal/vad.py deployment (Best quality)""" try: + file_duration = AudioSegment.from_wav(file_path).duration_seconds + 
print('vad_is_empty file duration:', file_duration) with open(file_path, 'rb') as file: files = {'file': (file_path.split('/')[-1], file, 'audio/wav')} response = requests.post(os.getenv('HOSTED_VAD_API_URL'), files=files)