speechmatics model recognizes speaker id

BasedHardware · Sep 20, 2024 · bb11053 · bb11053
1 parent 0ecf5c6
commit bb11053
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 9 deletions.
diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py
@@ -7,6 +7,7 @@
 import requests
 from fastapi import APIRouter
 from fastapi.websockets import WebSocketDisconnect, WebSocket
+from pydub import AudioSegment
 from starlette.websockets import WebSocketState
 
 import database.memories as memories_db
@@ -205,6 +206,7 @@ def stream_audio(audio_buffer):
 
     soniox_socket = None
     speechmatics_socket = None
+    speechmatics_socket2 = None
     deepgram_socket = None
     deepgram_socket2 = None
 
@@ -240,9 +242,20 @@ def stream_audio(audio_buffer):
                 stream_transcript, speech_profile_stream_id, language, uid if include_speech_profile else None
             )
         elif stt_service == STTService.speechmatics:
+            file_path = None
+            if language == 'en' and codec == 'opus' and include_speech_profile:
+                file_path = get_profile_audio_if_exists(uid)
+                duration = AudioSegment.from_wav(file_path).duration_seconds + 5 if file_path else 0
+
             speechmatics_socket = await process_audio_speechmatics(
-                stream_transcript, speech_profile_stream_id, language, uid if include_speech_profile else None
+                stream_transcript, speech_profile_stream_id, language, preseconds=duration
             )
+            if duration:
+                #     speechmatics_socket2 = await process_audio_speechmatics(
+                #         stream_transcript, speech_profile_stream_id, language, preseconds=duration
+                #     )
+                await send_initial_file_path(file_path, speechmatics_socket)
+                print('speech_profile speechmatics duration', duration)
 
     except Exception as e:
         print(f"Initial processing error: {e}")
@@ -255,7 +268,7 @@ def stream_audio(audio_buffer):
 
     decoder = opuslib.Decoder(sample_rate, channels)
 
-    async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_socket):
+    async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_socket1):
         nonlocal websocket_active
         nonlocal websocket_close_code
         nonlocal timer_start
@@ -279,9 +292,18 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock
                     decoded_opus = decoder.decode(bytes(data), frame_size=160)
                     await soniox_socket.send(decoded_opus)
 
-                if speechmatics_socket is not None:
+                if speechmatics_socket1 is not None:
                     decoded_opus = decoder.decode(bytes(data), frame_size=160)
-                    await speechmatics_socket.send(decoded_opus)
+                    await speechmatics_socket1.send(decoded_opus)
+
+                    # elapsed_seconds = time.time() - timer_start
+                    # if elapsed_seconds > duration or not dg_socket2:
+                    #     if speechmatics_socket2:
+                    #         print('Killing socket2 speechmatics')
+                    #         speechmatics_socket2.close()
+                    #         speechmatics_socket2 = None
+                    # else:
+                    #     speechmatics_socket2.send(decoded_opus)
 
                 if deepgram_socket is not None:
                     elapsed_seconds = time.time() - timer_start
@@ -314,6 +336,8 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock
                 await soniox_socket.close()
             if speechmatics_socket:
                 await speechmatics_socket.close()
+            if speechmatics_socket2:
+                await speechmatics_socket2.close()
 
     # heart beat
     async def send_heartbeat():

diff --git a/backend/utils/other/storage.py b/backend/utils/other/storage.py
@@ -149,6 +149,7 @@ def create_signed_postprocessing_audio_url(file_path: str):
     blob = bucket.blob(file_path)
     return _get_signed_url(blob, 15)
 
+
 def upload_postprocessing_audio_bytes(file_path: str, audio_buffer: bytes):
     bucket = storage_client.bucket(postprocessing_audio_bucket)
     blob = bucket.blob(file_path)
@@ -162,13 +163,13 @@ def upload_sdcard_audio(file_path: str):
     blob.upload_from_filename(file_path)
     return f'https://storage.googleapis.com/{postprocessing_audio_bucket}/sdcard/{file_path}'
 
+
 def download_postprocessing_audio(file_path: str, destination_file_path: str):
     bucket = storage_client.bucket(postprocessing_audio_bucket)
     blob = bucket.blob(file_path)
     blob.download_to_filename(destination_file_path)
 
 
-
 # ************************************************
 # ************* MEMORIES RECORDINGS **************
 # ************************************************

diff --git a/backend/utils/stt/streaming.py b/backend/utils/stt/streaming.py
@@ -61,6 +61,22 @@
 #     return segments
 
 
+async def send_initial_file_path(file_path: str, transcript_socket):
+    print('send_initial_file_path')
+    start = time.time()
+    # Reading and sending in chunks
+    with open(file_path, "rb") as file:
+        while True:
+            chunk = file.read(320)
+            if not chunk:
+                break
+            # print('Uploading', len(chunk))
+            await transcript_socket.send(bytes(chunk))
+            await asyncio.sleep(0.0001)  # if it takes too long to transcribe
+
+    print('send_initial_file_path', time.time() - start)
+
+
 async def send_initial_file(data: List[List[int]], transcript_socket):
     print('send_initial_file2')
     start = time.time()
@@ -283,7 +299,7 @@ async def on_message():
 CONNECTION_URL = f"wss://eu2.rt.speechmatics.com/v2"
 
 
-async def process_audio_speechmatics(stream_transcript, stream_id: int, language: str, uid: str):
+async def process_audio_speechmatics(stream_transcript, stream_id: int, language: str, preseconds: int = 0):
     # Create a transcription client
     api_key = os.getenv('SPEECHMATICS_API_KEY')
     uri = 'wss://eu2.rt.speechmatics.com/v2'
@@ -334,9 +350,10 @@ async def on_message():
                             continue
                         segments = []
                         for r in results:
-                            # print(r)
+                            print(r)
                             if not r['alternatives']:
                                 continue
+
                             r_data = r['alternatives'][0]
                             r_type = r['type']  # word | punctuation
                             r_start = r['start_time']
@@ -349,14 +366,19 @@ async def on_message():
                                 continue
                             r_speaker = r_data['speaker'][1:] if r_data['speaker'] != 'UU' else '1'
                             speaker = f"SPEAKER_0{r_speaker}"
+
+                            is_user = True if r_speaker == '1' and preseconds > 0 else False
+                            if r_start < preseconds:
+                                print('Skipping word', r_start, r_content)
+                                continue
                             # print(r_content, r_speaker, [r_start, r_end])
                             if not segments:
                                 segments.append({
                                     'speaker': speaker,
                                     'start': r_start,
                                     'end': r_end,
                                     'text': r_content,
-                                    'is_user': False,
+                                    'is_user': is_user,
                                     'person_id': None,
                                 })
                             else:
@@ -370,7 +392,7 @@ async def on_message():
                                         'start': r_start,
                                         'end': r_end,
                                         'text': r_content,
-                                        'is_user': False,
+                                        'is_user': is_user,
                                         'person_id': None,
                                     })