-
Notifications
You must be signed in to change notification settings - Fork 471
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Align start seconds for transcript segment #906
Changes from all commits
2229df9
60376d9
8010780
a306ce5
978957d
bd95410
8465de6
60b33c4
35027e2
461d062
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import threading | ||
import math | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
import asyncio | ||
import time | ||
from typing import List | ||
|
@@ -139,10 +140,27 @@ def stream_transcript(segments, stream_id): | |
nonlocal processing_memory | ||
nonlocal processing_memory_synced | ||
nonlocal memory_transcript_segements | ||
nonlocal segment_start | ||
nonlocal segment_end | ||
|
||
if not segments or len(segments) == 0: | ||
return | ||
|
||
# Align the start, end segment | ||
if not segment_start: | ||
start = segments[0]["start"] | ||
segment_start = start | ||
|
||
# end | ||
end = segments[-1]["end"] | ||
if not segment_end or segment_end < end: | ||
segment_end = end | ||
|
||
for i, segment in enumerate(segments): | ||
segment["start"] -= segment_start | ||
segment["end"] -= segment_start | ||
segments[i] = segment | ||
|
||
Comment on lines
+143
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic for aligning the start and end of segments seems correct. However, there is no error handling or validation for the # Align the start, end segment
if not segment_start:
start = segments[0].get("start")
if start is None:
raise ValueError("Each segment should have a 'start' field.")
segment_start = start
# end
end = segments[-1].get("end")
if end is None:
raise ValueError("Each segment should have an 'end' field.")
if not segment_end or segment_end < end:
segment_end = end
for i, segment in enumerate(segments):
if "start" not in segment or "end" not in segment:
raise ValueError("Each segment should have 'start' and 'end' fields.")
segment["start"] -= segment_start
segment["end"] -= segment_start
segments[i] = segment |
||
asyncio.run_coroutine_threadsafe(websocket.send_json(segments), loop) | ||
threading.Thread(target=process_segments, args=(uid, segments)).start() | ||
|
||
|
@@ -178,6 +196,9 @@ def stream_audio(audio_buffer): | |
websocket_active = True | ||
websocket_close_code = 1001 # Going Away, don't close with good from backend | ||
timer_start = None | ||
segment_start = None | ||
segment_end = None | ||
audio_frames_per_sec = 100 | ||
# audio_buffer = None | ||
duration = 0 | ||
try: | ||
|
@@ -239,7 +260,8 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock | |
# audio_file = open(path, "a") | ||
try: | ||
while websocket_active: | ||
data = await websocket.receive_bytes() | ||
raw_data = await websocket.receive_bytes() | ||
data = raw_data[:] | ||
Comment on lines
+263
to
+264
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
# audio_buffer.extend(data) | ||
if codec == 'opus' and sample_rate == 16000: | ||
data = decoder.decode(bytes(data), frame_size=160) | ||
|
@@ -262,7 +284,7 @@ async def receive_audio(dg_socket1, dg_socket2, soniox_socket, speechmatics_sock | |
dg_socket2.send(data) | ||
|
||
# stream | ||
stream_audio(data) | ||
stream_audio(raw_data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
# audio_buffer = bytearray() | ||
|
||
|
@@ -331,10 +353,11 @@ async def _create_processing_memory(): | |
last_processing_memory_data = processing_memories_db.get_last(uid) | ||
if last_processing_memory_data: | ||
last_processing_memory = ProcessingMemory(**last_processing_memory_data) | ||
segment_end = 0 | ||
last_segment_end = 0 | ||
for segment in last_processing_memory.transcript_segments: | ||
segment_end = max(segment_end, segment.end) | ||
if last_processing_memory.timer_start + segment_end + min_seconds_limit > time.time(): | ||
last_segment_end = max(last_segment_end, segment.end) | ||
timer_segment_start = last_processing_memory.timer_segment_start if last_processing_memory.timer_segment_start else last_processing_memory.timer_start | ||
if timer_segment_start + last_segment_end + min_seconds_limit > time.time(): | ||
Comment on lines
+356
to
+360
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The variable name - last_segment_end = 0
+ max_segment_end = 0
- for segment in last_processing_memory.transcript_segments:
- last_segment_end = max(last_segment_end, segment.end)
+ for segment in last_processing_memory.transcript_segments:
+ max_segment_end = max(max_segment_end, segment.end) |
||
processing_memory = last_processing_memory | ||
|
||
# Or create new | ||
|
@@ -343,6 +366,7 @@ async def _create_processing_memory(): | |
id=str(uuid.uuid4()), | ||
created_at=datetime.now(timezone.utc), | ||
timer_start=timer_start, | ||
timer_segment_start=timer_start+segment_start, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
language=language, | ||
) | ||
|
||
|
@@ -375,11 +399,22 @@ async def _post_process_memory(memory: Memory): | |
nonlocal processing_memory | ||
nonlocal processing_audio_frames | ||
nonlocal processing_audio_frame_synced | ||
nonlocal segment_start | ||
nonlocal segment_end | ||
|
||
# Create wav | ||
processing_audio_frame_synced = len(processing_audio_frames) | ||
|
||
# Remove audio frames [start, end] | ||
left = 0 | ||
if segment_start: | ||
left = max(left, math.floor(segment_start) * audio_frames_per_sec) | ||
right = processing_audio_frame_synced | ||
if segment_end: | ||
right = min(math.ceil(segment_end) * audio_frames_per_sec, right) | ||
|
||
Comment on lines
+402
to
+415
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic for creating a WAV file seems correct. However, there is no error handling or validation for the if segment_start:
+ if not isinstance(segment_start, (int, float)):
+ raise TypeError("`segment_start` should be a number.")
left = max(left, math.floor(segment_start) * audio_frames_per_sec)
if segment_end:
+ if not isinstance(segment_end, (int, float)):
+ raise TypeError("`segment_end` should be a number.")
right = min(math.ceil(segment_end) * audio_frames_per_sec, right) |
||
file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" | ||
create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[:processing_audio_frame_synced], | ||
create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[left:right], | ||
Comment on lines
+402
to
+417
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic for creating a WAV file from a subset of audio frames based on segment start and end times seems correct. However, there is no error handling or validation for the # Remove audio frames [start, end]
left = 0
if segment_start:
left = max(left, math.floor(segment_start) * audio_frames_per_sec)
right = processing_audio_frame_synced
if segment_end:
right = min(math.ceil(segment_end) * audio_frames_per_sec, right)
file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be"
create_wav_from_bytes(file_path=file_path, frames=processing_audio_frames[left:right],
frame_rate=sample_rate, channels=channels, codec=codec, ) |
||
frame_rate=sample_rate, channels=channels, codec=codec, ) | ||
|
||
# Try merge new audio with the previous | ||
|
@@ -393,7 +428,8 @@ async def _post_process_memory(memory: Memory): | |
|
||
# merge | ||
merge_file_path = f"_temp/{memory.id}_{uuid.uuid4()}_be" | ||
merge_wav_files(merge_file_path, [previous_file_path, file_path]) | ||
nearest_timer_start = processing_memory.timer_starts[-2] | ||
merge_wav_files(merge_file_path, [previous_file_path, file_path], [math.ceil(timer_start-nearest_timer_start), 0]) | ||
Comment on lines
+431
to
+432
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The - merge_wav_files(merge_file_path, [previous_file_path, file_path])
+ merge_wav_files(merge_file_path, [previous_file_path, file_path], [math.ceil(timer_start-nearest_timer_start), 0]) |
||
|
||
# clean | ||
os.remove(previous_file_path) | ||
|
@@ -489,6 +525,10 @@ async def _create_memory(): | |
memories_db.update_memory_segments(uid, memory.id, | ||
[segment.dict() for segment in memory.transcript_segments]) | ||
|
||
# Update finished at | ||
memory.finished_at = datetime.fromtimestamp(memory.started_at.timestamp() + processing_memory.transcript_segments[-1].end, timezone.utc) | ||
memories_db.update_memory_finished_at(uid, memory.id, memory.finished_at) | ||
|
||
# Process | ||
memory = process_memory(uid, memory.language, memory, force_process=True) | ||
|
||
|
@@ -525,6 +565,8 @@ async def _try_flush_new_memory_with_lock(time_validate: bool = True): | |
async def _try_flush_new_memory(time_validate: bool = True): | ||
nonlocal memory_transcript_segements | ||
nonlocal timer_start | ||
nonlocal segment_start | ||
nonlocal segment_end | ||
nonlocal processing_memory | ||
nonlocal processing_memory_synced | ||
nonlocal processing_audio_frames | ||
|
@@ -535,13 +577,8 @@ async def _try_flush_new_memory(time_validate: bool = True): | |
return | ||
|
||
# Validate last segment | ||
last_segment = None | ||
if len(memory_transcript_segements) > 0: | ||
last_segment = memory_transcript_segements[-1] | ||
if not last_segment: | ||
if not segment_end: | ||
print("Not last segment or last segment invalid") | ||
Comment on lines
+580
to
581
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The validation for if not segment_end:
print("Not last segment or last segment invalid")
+ return |
||
if last_segment: | ||
print(f"{last_segment.dict()}") | ||
return | ||
|
||
# First chunk, create processing memory | ||
|
@@ -552,7 +589,6 @@ async def _try_flush_new_memory(time_validate: bool = True): | |
|
||
# Validate transcript | ||
# Longer 120s | ||
segment_end = last_segment.end | ||
now = time.time() | ||
should_create_memory_time = True | ||
if time_validate: | ||
Comment on lines
589
to
594
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,14 +3,16 @@ | |
from pyogg import OpusDecoder | ||
from pydub import AudioSegment | ||
|
||
def merge_wav_files(dest_file_path: str, source_files: [str]): | ||
def merge_wav_files(dest_file_path: str, source_files: [str], silent_seconds: [int]): | ||
if len(source_files) == 0 or not dest_file_path: | ||
return | ||
|
||
combined_sounds = AudioSegment.empty() | ||
for file_path in source_files: | ||
for i in range(len(source_files)): | ||
file_path = source_files[i] | ||
sound = AudioSegment.from_wav(file_path) | ||
combined_sounds = combined_sounds + sound | ||
silent_sec = silent_seconds[i] | ||
combined_sounds = combined_sounds + sound + AudioSegment.silent(duration=silent_sec) | ||
Comment on lines
+6
to
+15
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The new implementation of + if len(source_files) != len(silent_seconds):
+ raise ValueError("source_files and silent_seconds must be of the same length") Also, consider handling the case where + if any(sec < 0 for sec in silent_seconds):
+ raise ValueError("All durations in silent_seconds must be non-negative") Lastly, the current implementation will add a silent segment after the last audio file as well. If this is not intended, you might want to adjust the loop to skip adding silence after the last audio file. - for i in range(len(source_files)):
+ for i in range(len(source_files) - 1):
file_path = source_files[i]
sound = AudioSegment.from_wav(file_path)
silent_sec = silent_seconds[i]
combined_sounds = combined_sounds + sound + AudioSegment.silent(duration=silent_sec)
+ combined_sounds += AudioSegment.from_wav(source_files[-1]) This way, silence is added only between audio files, not after the last one. |
||
combined_sounds.export(dest_file_path, format="wav") | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,11 +23,11 @@ async def create_memory_by_processing_memory(uid: str, processing_memory_id: str | |
if not transcript_segments or len(transcript_segments) == 0: | ||
print("Transcript segments is invalid") | ||
return | ||
timer_start = processing_memory.timer_start | ||
timer_segment_start = processing_memory.timer_segment_start if processing_memory.timer_segment_start else processing_memory.timer_start | ||
segment_end = transcript_segments[-1].end | ||
new_memory = CreateMemory( | ||
started_at=datetime.fromtimestamp(timer_start, timezone.utc), | ||
finished_at=datetime.fromtimestamp(timer_start + segment_end, timezone.utc), | ||
started_at=datetime.fromtimestamp(timer_segment_start, timezone.utc), | ||
finished_at=datetime.fromtimestamp(timer_segment_start + segment_end, timezone.utc), | ||
Comment on lines
+26
to
+30
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The changes made here are logical and seem to be in line with the PR's goal of aligning start seconds for transcript segments. However, there is a potential issue with error handling. If - timer_segment_start = processing_memory.timer_segment_start if processing_memory.timer_segment_start else processing_memory.timer_start
+ timer_segment_start = processing_memory.timer_segment_start if processing_memory.timer_segment_start else processing_memory.timer_start
+ if timer_segment_start is None:
+ print("Timer segment start is invalid")
+ return This way, we can avoid unexpected exceptions and provide a more informative error message. |
||
language=processing_memory.language, | ||
transcript_segments=transcript_segments, | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Entelligence.AI
The addition of
timer_segment_start
as an optional field in theProcessingMemory
class seems fine. However, it would be beneficial to add a comment explaining what this new field represents and how it differs fromtimer_start
. This will help other developers understand its purpose more easily.