From 2229df9d95079b76591f9ae69d6da0dc46582952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 12:04:04 +0700 Subject: [PATCH 01/10] Align start seconds for transcript segment --- backend/routers/transcribe.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 8e7f20fdd..9047867e4 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -139,10 +139,21 @@ def stream_transcript(segments, stream_id): nonlocal processing_memory nonlocal processing_memory_synced nonlocal memory_transcript_segements + nonlocal segment_start if not segments or len(segments) == 0: return + # Align the start, end segment + if len(memory_transcript_segements) == 0 and len(segments) > 0: + start = segments[0]["start"] + if not segment_start or segment_start > start: + segment_start = start + for i, segment in enumerate(segments): + segment["start"] -= segment_start + segment["end"] -= segment_start + segments[i] = segment + asyncio.run_coroutine_threadsafe(websocket.send_json(segments), loop) threading.Thread(target=process_segments, args=(uid, segments)).start() @@ -178,6 +189,8 @@ def stream_audio(audio_buffer): websocket_active = True websocket_close_code = 1001 # Going Away, don't close with good from backend timer_start = None + segment_start = None + segment_end = None # audio_buffer = None duration = 0 try: @@ -377,6 +390,7 @@ async def _post_process_memory(memory: Memory): nonlocal processing_audio_frame_synced # Create wav + # TODO: remove audio frames [start, end] processing_audio_frame_synced = len(processing_audio_frames) file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[:processing_audio_frame_synced], From 60376d943063ebdc73a2f14697d730484a991eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 13:33:02 +0700 Subject: [PATCH 02/10] Trim the audio frames regarding the start, end --- backend/routers/transcribe.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 9047867e4..29136022c 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -1,4 +1,5 @@ import threading +import math import asyncio import time from typing import List @@ -140,15 +141,23 @@ def stream_transcript(segments, stream_id): nonlocal processing_memory_synced nonlocal memory_transcript_segements nonlocal segment_start + nonlocal segment_end if not segments or len(segments) == 0: return # Align the start, end segment - if len(memory_transcript_segements) == 0 and len(segments) > 0: - start = segments[0]["start"] - if not segment_start or segment_start > start: + if len(segments) > 0: + # start + if not segment_start: + start = segments[0]["start"] segment_start = start + + # end + end = segments[-1]["end"] + if not segment_end or segment_end < end: + segment_end = end + for i, segment in enumerate(segments): segment["start"] -= segment_start segment["end"] -= segment_start @@ -388,12 +397,23 @@ async def _post_process_memory(memory: Memory): nonlocal processing_memory nonlocal processing_audio_frames nonlocal processing_audio_frame_synced + nonlocal segment_start + nonlocal segment_end # Create wav - # TODO: remove audio frames [start, end] processing_audio_frame_synced = len(processing_audio_frames) + + # Remove audio frames [start, end] + frames_per_sec = 100 + left = 0 + if segment_start: + left = max(left, math.floor(segment_start * frames_per_sec)) + right = processing_audio_frame_synced + if segment_end: + right = min(math.ceil(segment_end * frames_per_sec), right) + file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" - create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[:processing_audio_frame_synced], + create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[left:right], frame_rate=sample_rate, channels=channels, codec=codec, ) # Try merge new audio with the previous From 8010780ad11e8d985ce8e9d5f37f865e8c90a278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 13:40:45 +0700 Subject: [PATCH 03/10] Create memory with delta segment start --- backend/utils/processing_memories.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/utils/processing_memories.py b/backend/utils/processing_memories.py index 4cff65121..f22f00dca 100644 --- a/backend/utils/processing_memories.py +++ b/backend/utils/processing_memories.py @@ -24,10 +24,11 @@ async def create_memory_by_processing_memory(uid: str, processing_memory_id: str print("Transcript segments is invalid") return timer_start = processing_memory.timer_start + segment_start = transcript_segments[0].start segment_end = transcript_segments[-1].end new_memory = CreateMemory( - started_at=datetime.fromtimestamp(timer_start, timezone.utc), - finished_at=datetime.fromtimestamp(timer_start + segment_end, timezone.utc), + started_at=datetime.fromtimestamp(timer_start + segment_start, timezone.utc), + finished_at=datetime.fromtimestamp(timer_start + segment_start + segment_end, timezone.utc), language=processing_memory.language, transcript_segments=transcript_segments, ) From a306ce5a925bc2c1467976b3dbb6be424df0dddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 13:44:19 +0700 Subject: [PATCH 04/10] Remove redundant len check on new segments --- backend/routers/transcribe.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 29136022c..acdff7c4b 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -147,16 +147,14 @@ def stream_transcript(segments, stream_id): return # Align the start, end segment - if len(segments) > 0: - # start - if not segment_start: - start = segments[0]["start"] - segment_start = start + if not segment_start: + start = segments[0]["start"] + segment_start = start - # end - end = segments[-1]["end"] - if not segment_end or segment_end < end: - segment_end = end + # end + end = segments[-1]["end"] + if not segment_end or segment_end < end: + segment_end = end for i, segment in enumerate(segments): segment["start"] -= segment_start @@ -200,6 +198,7 @@ def stream_audio(audio_buffer): timer_start = None segment_start = None segment_end = None + audio_frames_per_sec = 100 # audio_buffer = None duration = 0 try: @@ -404,13 +403,12 @@ async def _post_process_memory(memory: Memory): processing_audio_frame_synced = len(processing_audio_frames) # Remove audio frames [start, end] - frames_per_sec = 100 left = 0 if segment_start: - left = max(left, math.floor(segment_start * frames_per_sec)) + left = max(left, math.floor(segment_start * audio_frames_per_sec)) right = processing_audio_frame_synced if segment_end: - right = min(math.ceil(segment_end * frames_per_sec), right) + right = min(math.ceil(segment_end * audio_frames_per_sec), right) file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[left:right], From 978957d6e3cf3e634ed5604ad4ec82745b59ba34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 14:07:06 +0700 Subject: [PATCH 05/10] Use transcript timer segment start instead of timer start(audio) for started at, finished at memory --- backend/models/processing_memory.py | 1 + backend/routers/transcribe.py | 23 +++++++++++------------ backend/utils/processing_memories.py | 7 +++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/backend/models/processing_memory.py b/backend/models/processing_memory.py index c5ac29f08..11d812068 100644 --- a/backend/models/processing_memory.py +++ b/backend/models/processing_memory.py @@ -14,6 +14,7 @@ class ProcessingMemory(BaseModel): audio_url: Optional[str] = None created_at: datetime timer_start: float + timer_segment_start: Optional[float] = None timer_starts: List[float] = [] language: Optional[str] = None # applies only to Friend # TODO: once released migrate db to default 'en' transcript_segments: List[TranscriptSegment] = [] diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index acdff7c4b..a16bc595d 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -352,10 +352,11 @@ async def _create_processing_memory(): last_processing_memory_data = processing_memories_db.get_last(uid) if last_processing_memory_data: last_processing_memory = ProcessingMemory(**last_processing_memory_data) - segment_end = 0 + last_segment_end = 0 for segment in last_processing_memory.transcript_segments: - segment_end = max(segment_end, segment.end) - if last_processing_memory.timer_start + segment_end + min_seconds_limit > time.time(): + last_segment_end = max(last_segment_end, segment.end) + timer_segment_start = last_processing_memory.timer_segment_start if last_processing_memory.timer_segment_start else last_processing_memory.timer_start + if timer_segment_start + last_segment_end + min_seconds_limit > time.time(): processing_memory = last_processing_memory # Or create new @@ -364,6 +365,7 @@ async def _create_processing_memory(): id=str(uuid.uuid4()), created_at=datetime.now(timezone.utc), timer_start=timer_start, + timer_segment_start=timer_start+segment_start, language=language, ) @@ -557,6 +559,8 @@ async def _try_flush_new_memory_with_lock(time_validate: bool = True): async def _try_flush_new_memory(time_validate: bool = True): nonlocal memory_transcript_segements nonlocal timer_start + nonlocal segment_start + nonlocal segment_end nonlocal processing_memory nonlocal processing_memory_synced nonlocal processing_audio_frames @@ -567,13 +571,8 @@ async def _try_flush_new_memory(time_validate: bool = True): return # Validate last segment - last_segment = None - if len(memory_transcript_segements) > 0: - last_segment = memory_transcript_segements[-1] - if not last_segment: + if not segment_end: print("Not last segment or last segment invalid") - if last_segment: - print(f"{last_segment.dict()}") return # First chunk, create processing memory @@ -584,11 +583,11 @@ async def _try_flush_new_memory(time_validate: bool = True): # Validate transcript # Longer 120s - segment_end = last_segment.end now = time.time() should_create_memory_time = True if time_validate: - should_create_memory_time = timer_start + segment_end + min_seconds_limit < now + timer_segment_start = timer_start + segment_start + should_create_memory_time = timer_segment_start + segment_end + min_seconds_limit < now # 1 words at least should_create_memory_time_words = min_words_limit == 0 @@ -602,7 +601,7 @@ async def _try_flush_new_memory(time_validate: bool = True): should_create_memory = should_create_memory_time and should_create_memory_time_words print( - f"Should create memory {should_create_memory} - {timer_start} {segment_end} {min_seconds_limit} {now} - {time_validate}, session {session_id}") + f"Should create memory {should_create_memory} - {timer_segment_start} {segment_end} {min_seconds_limit} {now} - {time_validate}, session {session_id}") if should_create_memory: memory = await _create_memory() if not memory: diff --git a/backend/utils/processing_memories.py b/backend/utils/processing_memories.py index f22f00dca..eeaabb9b2 100644 --- a/backend/utils/processing_memories.py +++ b/backend/utils/processing_memories.py @@ -23,12 +23,11 @@ async def create_memory_by_processing_memory(uid: str, processing_memory_id: str if not transcript_segments or len(transcript_segments) == 0: print("Transcript segments is invalid") return - timer_start = processing_memory.timer_start - segment_start = transcript_segments[0].start + timer_segment_start = processing_memory.timer_segment_start segment_end = transcript_segments[-1].end new_memory = CreateMemory( - started_at=datetime.fromtimestamp(timer_start + segment_start, timezone.utc), - finished_at=datetime.fromtimestamp(timer_start + segment_start + segment_end, timezone.utc), + started_at=datetime.fromtimestamp(timer_segment_start, timezone.utc), + finished_at=datetime.fromtimestamp(timer_segment_start + segment_end, timezone.utc), language=processing_memory.language, transcript_segments=transcript_segments, ) From bd95410f8958c9c1965fde10582cda88f15aaae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 14:10:44 +0700 Subject: [PATCH 06/10] Backward compatbile with the old ProcessingMemory model --- backend/utils/processing_memories.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/utils/processing_memories.py b/backend/utils/processing_memories.py index eeaabb9b2..057cbd0c6 100644 --- a/backend/utils/processing_memories.py +++ b/backend/utils/processing_memories.py @@ -23,7 +23,7 @@ async def create_memory_by_processing_memory(uid: str, processing_memory_id: str if not transcript_segments or len(transcript_segments) == 0: print("Transcript segments is invalid") return - timer_segment_start = processing_memory.timer_segment_start + timer_segment_start = processing_memory.timer_segment_start if processing_memory.timer_segment_start else processing_memory.timer_start segment_end = transcript_segments[-1].end new_memory = CreateMemory( started_at=datetime.fromtimestamp(timer_segment_start, timezone.utc), From 8465de643dd45c500f0f448fabbd4e5a28beadc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 16:59:21 +0700 Subject: [PATCH 07/10] Trim audio frames --- backend/routers/transcribe.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index a16bc595d..62bb895cd 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -260,7 +260,8 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock # audio_file = open(path, "a") try: while websocket_active: - data = await websocket.receive_bytes() + raw_data = await websocket.receive_bytes() + data = raw_data[:] # audio_buffer.extend(data) if codec == 'opus' and sample_rate == 16000: data = decoder.decode(bytes(data), frame_size=160) @@ -283,7 +284,7 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock dg_socket2.send(data) # stream - stream_audio(data) + stream_audio(raw_data) # audio_buffer = bytearray() @@ -407,10 +408,10 @@ async def _post_process_memory(memory: Memory): # Remove audio frames [start, end] left = 0 if segment_start: - left = max(left, math.floor(segment_start * audio_frames_per_sec)) + left = max(left, math.floor(segment_start) * audio_frames_per_sec) right = processing_audio_frame_synced if segment_end: - right = min(math.ceil(segment_end * audio_frames_per_sec), right) + right = min(math.ceil(segment_end) * audio_frames_per_sec, right) file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[left:right], @@ -585,8 +586,8 @@ async def _try_flush_new_memory(time_validate: bool = True): # Longer 120s now = time.time() should_create_memory_time = True + timer_segment_start = timer_start + segment_start if time_validate: - timer_segment_start = timer_start + segment_start should_create_memory_time = timer_segment_start + segment_end + min_seconds_limit < now # 1 words at least From 60b33c4be9a3cc93da7af270a634e34b66e884ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 17:09:43 +0700 Subject: [PATCH 08/10] Use timmer start to validate create new memory --- backend/routers/transcribe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 62bb895cd..a1ac62bfa 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -586,9 +586,8 @@ async def _try_flush_new_memory(time_validate: bool = True): # Longer 120s now = time.time() should_create_memory_time = True - timer_segment_start = timer_start + segment_start if time_validate: - should_create_memory_time = timer_segment_start + segment_end + min_seconds_limit < now + should_create_memory_time = timer_start + segment_end + min_seconds_limit < now # 1 words at least should_create_memory_time_words = min_words_limit == 0 @@ -602,7 +601,7 @@ async def _try_flush_new_memory(time_validate: bool = True): should_create_memory = should_create_memory_time and should_create_memory_time_words print( - f"Should create memory {should_create_memory} - {timer_segment_start} {segment_end} {min_seconds_limit} {now} - {time_validate}, session {session_id}") + f"Should create memory {should_create_memory} - {timer_start} {segment_end} {min_seconds_limit} {now} - {time_validate}, session {session_id}") if should_create_memory: memory = await _create_memory() if not memory: From 35027e28239f199e03ff0eec6b61594be0197534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 17:35:08 +0700 Subject: [PATCH 09/10] Update finished at for combined memory --- backend/database/memories.py | 5 +++++ backend/routers/transcribe.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/backend/database/memories.py b/backend/database/memories.py index 299cb6eb2..7bc30f07e 100644 --- a/backend/database/memories.py +++ b/backend/database/memories.py @@ -138,6 +138,11 @@ def update_memory_events(uid: str, memory_id: str, events: List[dict]): memory_ref = user_ref.collection('memories').document(memory_id) memory_ref.update({'structured.events': events}) +def update_memory_finished_at(uid: str, memory_id: str, finished_at: datetime): + user_ref = db.collection('users').document(uid) + memory_ref = user_ref.collection('memories').document(memory_id) + memory_ref.update({'finished_at': finished_at}) + # VISBILITY diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index a1ac62bfa..49a9463ad 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -524,6 +524,10 @@ async def _create_memory(): memories_db.update_memory_segments(uid, memory.id, [segment.dict() for segment in memory.transcript_segments]) + # Update finished at + memory.finished_at = datetime.fromtimestamp(memory.started_at.timestamp() + processing_memory.transcript_segments[-1].end, timezone.utc) + memories_db.update_memory_finished_at(uid, memory.id, memory.finished_at) + # Process memory = process_memory(uid, memory.language, memory, force_process=True) From 461d062bf1abc688ab921331f9d7fab2a368290e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=E1=BB=8Bnh?= Date: Sun, 22 Sep 2024 17:59:18 +0700 Subject: [PATCH 10/10] Add silent seconds between 2 combining audio --- backend/routers/transcribe.py | 3 ++- backend/utils/audio.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py index 49a9463ad..26a9267f3 100644 --- a/backend/routers/transcribe.py +++ b/backend/routers/transcribe.py @@ -428,7 +428,8 @@ async def _post_process_memory(memory: Memory): # merge merge_file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" - merge_wav_files(merge_file_path, [previous_file_path, file_path]) + nearest_timer_start = processing_memory.timer_starts[-2] + merge_wav_files(merge_file_path, [previous_file_path, file_path], [math.ceil(timer_start-nearest_timer_start), 0]) # clean os.remove(previous_file_path) diff --git a/backend/utils/audio.py b/backend/utils/audio.py index 069a2ebdd..bf4663c73 100644 --- a/backend/utils/audio.py +++ b/backend/utils/audio.py @@ -3,14 +3,16 @@ from pyogg import OpusDecoder from pydub import AudioSegment -def merge_wav_files(dest_file_path: str, source_files: [str]): +def merge_wav_files(dest_file_path: str, source_files: [str], silent_seconds: [int]): if len(source_files) == 0 or not dest_file_path: return combined_sounds = AudioSegment.empty() - for file_path in source_files: + for i in range(len(source_files)): + file_path = source_files[i] sound = AudioSegment.from_wav(file_path) - combined_sounds = combined_sounds + sound + silent_sec = silent_seconds[i] + combined_sounds = combined_sounds + sound + AudioSegment.silent(duration=silent_sec) combined_sounds.export(dest_file_path, format="wav")