Skip to content

Commit

Permalink
speechmatics model recognizes speaker id
Browse files Browse the repository at this point in the history
  • Loading branch information
josancamon19 committed Sep 20, 2024
1 parent 0ecf5c6 commit bb11053
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 9 deletions.
32 changes: 28 additions & 4 deletions backend/routers/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
from fastapi import APIRouter
from fastapi.websockets import WebSocketDisconnect, WebSocket
from pydub import AudioSegment
from starlette.websockets import WebSocketState

import database.memories as memories_db
Expand Down Expand Up @@ -205,6 +206,7 @@ def stream_audio(audio_buffer):

soniox_socket = None
speechmatics_socket = None
speechmatics_socket2 = None
deepgram_socket = None
deepgram_socket2 = None

Expand Down Expand Up @@ -240,9 +242,20 @@ def stream_audio(audio_buffer):
stream_transcript, speech_profile_stream_id, language, uid if include_speech_profile else None
)
elif stt_service == STTService.speechmatics:
file_path = None
if language == 'en' and codec == 'opus' and include_speech_profile:
file_path = get_profile_audio_if_exists(uid)
duration = AudioSegment.from_wav(file_path).duration_seconds + 5 if file_path else 0

speechmatics_socket = await process_audio_speechmatics(
stream_transcript, speech_profile_stream_id, language, uid if include_speech_profile else None
stream_transcript, speech_profile_stream_id, language, preseconds=duration
)
if duration:
# speechmatics_socket2 = await process_audio_speechmatics(
# stream_transcript, speech_profile_stream_id, language, preseconds=duration
# )
await send_initial_file_path(file_path, speechmatics_socket)
print('speech_profile speechmatics duration', duration)

except Exception as e:
print(f"Initial processing error: {e}")
Expand All @@ -255,7 +268,7 @@ def stream_audio(audio_buffer):

decoder = opuslib.Decoder(sample_rate, channels)

async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_socket):
async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_socket1):
nonlocal websocket_active
nonlocal websocket_close_code
nonlocal timer_start
Expand All @@ -279,9 +292,18 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock
decoded_opus = decoder.decode(bytes(data), frame_size=160)
await soniox_socket.send(decoded_opus)

if speechmatics_socket is not None:
if speechmatics_socket1 is not None:
decoded_opus = decoder.decode(bytes(data), frame_size=160)
await speechmatics_socket.send(decoded_opus)
await speechmatics_socket1.send(decoded_opus)

# elapsed_seconds = time.time() - timer_start
# if elapsed_seconds > duration or not dg_socket2:
# if speechmatics_socket2:
# print('Killing socket2 speechmatics')
# speechmatics_socket2.close()
# speechmatics_socket2 = None
# else:
# speechmatics_socket2.send(decoded_opus)

if deepgram_socket is not None:
elapsed_seconds = time.time() - timer_start
Expand Down Expand Up @@ -314,6 +336,8 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock
await soniox_socket.close()
if speechmatics_socket:
await speechmatics_socket.close()
if speechmatics_socket2:
await speechmatics_socket2.close()

# heart beat
async def send_heartbeat():
Expand Down
3 changes: 2 additions & 1 deletion backend/utils/other/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def create_signed_postprocessing_audio_url(file_path: str):
blob = bucket.blob(file_path)
return _get_signed_url(blob, 15)


def upload_postprocessing_audio_bytes(file_path: str, audio_buffer: bytes):
bucket = storage_client.bucket(postprocessing_audio_bucket)
blob = bucket.blob(file_path)
Expand All @@ -162,13 +163,13 @@ def upload_sdcard_audio(file_path: str):
blob.upload_from_filename(file_path)
return f'https://storage.googleapis.com/{postprocessing_audio_bucket}/sdcard/{file_path}'


def download_postprocessing_audio(file_path: str, destination_file_path: str):
bucket = storage_client.bucket(postprocessing_audio_bucket)
blob = bucket.blob(file_path)
blob.download_to_filename(destination_file_path)



# ************************************************
# ************* MEMORIES RECORDINGS **************
# ************************************************
Expand Down
30 changes: 26 additions & 4 deletions backend/utils/stt/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@
# return segments


async def send_initial_file_path(file_path: str, transcript_socket):
print('send_initial_file_path')
start = time.time()
# Reading and sending in chunks
with open(file_path, "rb") as file:
while True:
chunk = file.read(320)
if not chunk:
break
# print('Uploading', len(chunk))
await transcript_socket.send(bytes(chunk))
await asyncio.sleep(0.0001) # if it takes too long to transcribe

print('send_initial_file_path', time.time() - start)


async def send_initial_file(data: List[List[int]], transcript_socket):
print('send_initial_file2')
start = time.time()
Expand Down Expand Up @@ -283,7 +299,7 @@ async def on_message():
CONNECTION_URL = f"wss://eu2.rt.speechmatics.com/v2"


async def process_audio_speechmatics(stream_transcript, stream_id: int, language: str, uid: str):
async def process_audio_speechmatics(stream_transcript, stream_id: int, language: str, preseconds: int = 0):
# Create a transcription client
api_key = os.getenv('SPEECHMATICS_API_KEY')
uri = 'wss://eu2.rt.speechmatics.com/v2'
Expand Down Expand Up @@ -334,9 +350,10 @@ async def on_message():
continue
segments = []
for r in results:
# print(r)
print(r)
if not r['alternatives']:
continue

r_data = r['alternatives'][0]
r_type = r['type'] # word | punctuation
r_start = r['start_time']
Expand All @@ -349,14 +366,19 @@ async def on_message():
continue
r_speaker = r_data['speaker'][1:] if r_data['speaker'] != 'UU' else '1'
speaker = f"SPEAKER_0{r_speaker}"

is_user = True if r_speaker == '1' and preseconds > 0 else False
if r_start < preseconds:
print('Skipping word', r_start, r_content)
continue
# print(r_content, r_speaker, [r_start, r_end])
if not segments:
segments.append({
'speaker': speaker,
'start': r_start,
'end': r_end,
'text': r_content,
'is_user': False,
'is_user': is_user,
'person_id': None,
})
else:
Expand All @@ -370,7 +392,7 @@ async def on_message():
'start': r_start,
'end': r_end,
'text': r_content,
'is_user': False,
'is_user': is_user,
'person_id': None,
})

Expand Down

0 comments on commit bb11053

Please sign in to comment.