Skip to content

Commit

Permalink
Fix speech profile but only for pcm8 and English
Browse files Browse the repository at this point in the history
  • Loading branch information
josancamon19 committed Aug 9, 2024
1 parent 84e15f1 commit 7539e2e
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 26 deletions.
1 change: 1 addition & 0 deletions app/lib/pages/capture/page.dart
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class CapturePageState extends State<CapturePage>

Future<void> initiateWebsocket([BleAudioCodec? audioCodec, int? sampleRate]) async {
BleAudioCodec codec = audioCodec ?? (btDevice?.id == null ? BleAudioCodec.pcm8 : await getAudioCodec(btDevice!.id));
int sampleRate = (codec == BleAudioCodec.opus ? 16000 : 8000);
await initWebSocket(
codec: codec,
sampleRate: sampleRate,
Expand Down
13 changes: 5 additions & 8 deletions backend/routers/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,10 @@ async def _websocket_util(
transcript_socket2 = None
duration = 0
try:
# if language == 'en':
# # TODO: if pcm16? meaning phone recording, should ignore?
# single_file_path, duration = get_speaker_audio_file(uid, target_sample_rate=sample_rate)
# else:
# single_file_path, duration = None, 0
single_file_path, duration = None, 0
# TODO: what if opus? and the samples were created with pcm?
if language == 'en' and codec == 'pcm8': # no pcm16 which is phone recording, no opus
single_file_path, duration = get_speaker_audio_file(uid, target_sample_rate=sample_rate)
else:
single_file_path, duration = None, 0
transcript_socket = await process_audio_dg(websocket, language, sample_rate, codec, channels,
preseconds=duration)
if duration:
Expand Down Expand Up @@ -118,7 +115,7 @@ async def receive_audio(socket1, socket2):
# continue
#
# audio_buffer = audio_buffer[window_size_samples * 2:]

# print(data)
elapsed_seconds = time.time() - timer_start
if elapsed_seconds > duration or not socket2:
socket1.send(data)
Expand Down
2 changes: 1 addition & 1 deletion backend/utils/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def retrieve_all_samples(uid: str):

for i, blob in enumerate(blobs):
path = f'{base_path}{blob.name.split("/")[-1]}'
if os.path.exists(path):
if os.path.exists(path): # when opus uploaded? should refresh the download
continue
try:
blob.download_to_filename(path)
Expand Down
37 changes: 20 additions & 17 deletions backend/utils/stt/deepgram_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,17 @@ def get_single_file(dir_path: str, target_sample_rate: int = 8000):
for sample in os.listdir(dir_path):
if '.wav' not in sample:
continue
if 'joined_output.wav' in sample:
continue
path = f'{dir_path}{sample}'
aseg = AudioSegment.from_file(path)
if aseg.frame_rate != target_sample_rate:
print('converting', aseg.frame_rate, 'to', target_sample_rate)
aseg = aseg.set_frame_rate(target_sample_rate)
files_to_join.append(aseg)
print(path, aseg.frame_rate)
if aseg.frame_rate == target_sample_rate:
files_to_join.append(aseg)

output = files_to_join[0] # KNOWN ISSUE
if not files_to_join:
return None
output = files_to_join[0]
for audio in files_to_join[1:]:
output += audio

Expand All @@ -117,17 +120,17 @@ def get_speaker_audio_file(uid: str, target_sample_rate: int = 8000) -> Tuple[Op
print('get_speaker_audio_file', uid, path, 'Files:', files_at_path)
if files_at_path < 5: # means user did less than 5 samples unfortunately, so not completed
return None, 0

single_file_path = f'{path}joined_output.wav'
if os.path.exists(single_file_path):
aseg = AudioSegment.from_wav(single_file_path)
if aseg.frame_rate == target_sample_rate:
aseg = aseg.set_frame_rate(target_sample_rate)
aseg.export(single_file_path, format='wav')
print('get_speaker_audio_file Cached Duration:', aseg.duration_seconds)
return single_file_path, aseg.duration_seconds
if aseg.frame_rate == target_sample_rate: # same sample rate
print('get_speaker_audio_file Cached Duration:', aseg.duration_seconds)
return single_file_path, aseg.duration_seconds

single_file_path = get_single_file(path, target_sample_rate)
if not single_file_path:
return None, 0 # no files for this codec

aseg = AudioSegment.from_wav(single_file_path)
print('get_speaker_audio_file Initial Duration:', aseg.duration_seconds, 'Sample rate:', aseg.frame_rate / 1000)
output = AudioSegment.empty()
Expand All @@ -137,14 +140,14 @@ def get_speaker_audio_file(uid: str, target_sample_rate: int = 8000) -> Tuple[Op
end = segment['end'] * 1000
output += aseg[start:end]

if output.duration_seconds < 20:
seconds = 30
if output.duration_seconds < seconds:
print('get_speaker_audio_file Output Duration:', output.duration_seconds)
return single_file_path, output.duration_seconds
return single_file_path, output.duration_seconds # < 30

seconds = 20
output = output[:20 * 1000]
output = output[:seconds * 1000]
output.export(single_file_path, format="wav")
return single_file_path, seconds
return single_file_path, output.duration_seconds # 30


deepgram = DeepgramClient(os.getenv('DEEPGRAM_API_KEY'), DeepgramClientOptions(options={"keepalive": "true"}))
Expand All @@ -156,7 +159,7 @@ async def process_audio_dg(
loop = asyncio.get_event_loop()

def on_message(self, result, **kwargs):
print("Received message from Deepgram") # Log when message is received
print(f"Received message from Deepgram") # Log when message is received
sentence = result.channel.alternatives[0].transcript
if len(sentence) == 0:
return
Expand Down

0 comments on commit 7539e2e

Please sign in to comment.