Skip to content

Commit

Permalink
whisper x postprocessing extra logs
Browse files Browse the repository at this point in the history
  • Loading branch information
josancamon19 committed Sep 20, 2024
1 parent b93dfd9 commit 819188f
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 8 deletions.
3 changes: 2 additions & 1 deletion backend/routers/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,11 @@ async def _websocket_util(
if stt_service == STTService.soniox and (
sample_rate != 16000 or codec != 'opus' or language not in soniox_valid_languages):
stt_service = STTService.deepgram

if stt_service == STTService.speechmatics and (sample_rate != 16000 or codec != 'opus'):
stt_service = STTService.deepgram

# At some point try running all the models together to easily compare

# Check: Why do we need a try/except around websocket.accept?
try:
await websocket.accept()
Expand Down
18 changes: 11 additions & 7 deletions backend/utils/memories/postprocess_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,36 @@
def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedback: bool, streaming_model: str):
memory_data = _get_memory_by_id(uid, memory_id)
if not memory_data:
return (404, "Memory not found")
return 404, "Memory not found"

memory = Memory(**memory_data)
if memory.discarded:
print('postprocess_memory: Memory is discarded')
return (400, "Memory is discarded")
return 400, "Memory is discarded"

if memory.postprocessing is not None and memory.postprocessing.status != PostProcessingStatus.not_started:
print(f'postprocess_memory: Memory can\'t be post-processed again {memory.postprocessing.status}')
return (400, "Memory can't be post-processed again")
return 400, "Memory can't be post-processed again"

aseg = AudioSegment.from_wav(file_path)
if aseg.duration_seconds < 10: # TODO: validate duration more accurately, segment.last.end - segment.first.start - 10
# TODO: fix app; sometimes the uploaded audio is wrong (too short).
print('postprocess_memory: Audio duration is too short, seems wrong.')
memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
return (500, "Audio duration is too short, seems wrong.")
return 500, "Audio duration is too short, seems wrong."

memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.in_progress)

try:
# Calling VAD to avoid processing empty parts and getting hallucinations from whisper.
# TODO: use these logs to determine whether whisperx is failing because of the VAD results.
print('previous to vad_is_empty (segments duration):',
memory.transcript_segments[-1].end - memory.transcript_segments[0].start)
vad_segments = vad_is_empty(file_path, return_segments=True)
if vad_segments:
start = vad_segments[0]['start']
end = vad_segments[-1]['end']
print('vad_is_empty file result segments:', start, end)
aseg = AudioSegment.from_wav(file_path)
aseg = aseg[max(0, (start - 1) * 1000):min((end + 1) * 1000, aseg.duration_seconds * 1000)]
aseg.export(file_path, format="wav")
Expand Down Expand Up @@ -90,7 +94,7 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb
memory.postprocessing = MemoryPostProcessing(
status=PostProcessingStatus.failed, model=PostProcessingModel.fal_whisperx)
# TODO: consider doing process_memory, if any segment still matched to user or people
return (200, memory)
return 200, memory

# Reprocess memory with improved transcription
result: Memory = process_memory(uid, memory.language, memory, force_process=True)
Expand All @@ -101,13 +105,13 @@ def postprocess_memory(memory_id: str, file_path: str, uid: str, emotional_feedb
except Exception as e:
print(e)
memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.failed, fail_reason=str(e))
return (500, str(e))
return 500, str(e)

memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.completed)
result.postprocessing = MemoryPostProcessing(
status=PostProcessingStatus.completed, model=PostProcessingModel.fal_whisperx)

return (200, result)
return 200, result


def _get_memory_by_id(uid: str, memory_id: str) -> dict:
Expand Down
2 changes: 2 additions & 0 deletions backend/utils/stt/vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def is_audio_empty(file_path, sample_rate=8000):
def vad_is_empty(file_path, return_segments: bool = False):
"""Uses vad_modal/vad.py deployment (Best quality)"""
try:
file_duration = AudioSegment.from_wav(file_path).duration_seconds
print('vad_is_empty file duration:', file_duration)
with open(file_path, 'rb') as file:
files = {'file': (file_path.split('/')[-1], file, 'audio/wav')}
response = requests.post(os.getenv('HOSTED_VAD_API_URL'), files=files)
Expand Down

0 comments on commit 819188f

Please sign in to comment.