whisper x postprocessing extra logs

BasedHardware · Sep 20, 2024 · 819188f · 819188f
1 parent b93dfd9
commit 819188f
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 8 deletions.
diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py
@@ -133,10 +133,11 @@ async def _websocket_util(
     if stt_service == STTService.soniox and (
             sample_rate != 16000 or codec != 'opus' or language not in soniox_valid_languages):
         stt_service = STTService.deepgram
-
     if stt_service == STTService.speechmatics and (sample_rate != 16000 or codec != 'opus'):
         stt_service = STTService.deepgram
 
+    # At some point try running all the models together to easily compare
+
     # Check: Why do we need try-catch around websocket.accept?
     try:
         await websocket.accept()

diff --git a/backend/utils/memories/postprocess_memory.py b/backend/utils/memories/postprocess_memory.py
@@ -19,32 +19,36 @@
 def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedback: bool, streaming_model: str):
     memory_data = _get_memory_by_id(uid, memory_id)
     if not memory_data:
-        return (404, "Memory not found")
+        return 404, "Memory not found"
 
     memory = Memory(**memory_data)
     if memory.discarded:
         print('postprocess_memory: Memory is discarded')
-        return (400, "Memory is discarded")
+        return 400, "Memory is discarded"
 
     if memory.postprocessing is not None and memory.postprocessing.status != PostProcessingStatus.not_started:
         print(f'postprocess_memory: Memory can\'t be post-processed again {memory.postprocessing.status}')
-        return (400, "Memory can't be post-processed again")
+        return 400, "Memory can't be post-processed again"
 
     aseg = AudioSegment.from_wav(file_path)
     if aseg.duration_seconds < 10:  # TODO: validate duration more accurately, segment.last.end - segment.first.start - 10
         # TODO: fix app, sometimes audio uploaded is wrong, is too short.
         print('postprocess_memory: Audio duration is too short, seems wrong.')
         memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
-        return (500, "Audio duration is too short, seems wrong.")
+        return 500, "Audio duration is too short, seems wrong."
 
     memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.in_progress)
 
     try:
         # Calling VAD to avoid processing empty parts and getting hallucinations from whisper.
+        # TODO: use these logs to determine if whisperx is failing because of the VAD results.
+        print('previous to vad_is_empty (segments duration):',
+              memory.transcript_segments[-1].end - memory.transcript_segments[0].start)
         vad_segments = vad_is_empty(file_path, return_segments=True)
         if vad_segments:
             start = vad_segments[0]['start']
             end = vad_segments[-1]['end']
+            print('vad_is_empty file result segments:', start, end)
             aseg = AudioSegment.from_wav(file_path)
             aseg = aseg[max(0, (start - 1) * 1000):min((end + 1) * 1000, aseg.duration_seconds * 1000)]
             aseg.export(file_path, format="wav")
@@ -90,7 +94,7 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb
             memory.postprocessing = MemoryPostProcessing(
                 status=PostProcessingStatus.failed, model=PostProcessingModel.fal_whisperx)
             # TODO: consider doing process_memory, if any segment still matched to user or people
-            return (200, memory)
+            return 200, memory
 
         # Reprocess memory with improved transcription
         result: Memory = process_memory(uid, memory.language, memory, force_process=True)
@@ -101,13 +105,13 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb
     except Exception as e:
         print(e)
         memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.failed, fail_reason=str(e))
-        return (500, str(e))
+        return 500, str(e)
 
     memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.completed)
     result.postprocessing = MemoryPostProcessing(
         status=PostProcessingStatus.completed, model=PostProcessingModel.fal_whisperx)
 
-    return (200, result)
+    return 200, result
 
 
 def _get_memory_by_id(uid: str, memory_id: str) -> dict:

diff --git a/backend/utils/stt/vad.py b/backend/utils/stt/vad.py
@@ -38,6 +38,8 @@ def is_audio_empty(file_path, sample_rate=8000):
 def vad_is_empty(file_path, return_segments: bool = False):
     """Uses vad_modal/vad.py deployment (Best quality)"""
     try:
+        file_duration = AudioSegment.from_wav(file_path).duration_seconds
+        print('vad_is_empty file duration:', file_duration)
         with open(file_path, 'rb') as file:
             files = {'file': (file_path.split('/')[-1], file, 'audio/wav')}
             response = requests.post(os.getenv('HOSTED_VAD_API_URL'), files=files)