
Commit

fix some function to match working one
0xzre committed Aug 17, 2024
1 parent 91c8d5d commit 70d1767
Showing 2 changed files with 7 additions and 16 deletions.
20 changes: 6 additions & 14 deletions backend/routers/transcribe.py
@@ -13,7 +13,7 @@

 from utils.redis_utils import get_user_speech_profile, get_user_speech_profile_duration
 from utils.stt.deepgram_util import process_audio_dg, send_initial_file2, transcribe_file_deepgram
-from utils.stt.vad import VADIterator, model, get_speech_state, SpeechState, vad_is_empty, is_speech_present
+from utils.stt.vad import VADIterator, model, vad_is_empty, is_speech_present

 router = APIRouter()

@@ -54,7 +54,7 @@ async def _websocket_util(
     websocket_active = True
     duration = 0
     is_speech_active = False
-    speech_timeout = 0.7 # Good for now but better dynamically adjust it by user behaviour
+    speech_timeout = 0.7 # Good for now but better dynamically adjust it by user behaviour, just idea: Increase as active time passes but until certain threshold, but not needed yet.
     last_speech_time = 0
     try:
         if language == 'en' and codec == 'opus' and include_speech_profile:
@@ -89,26 +89,20 @@ async def receive_audio(socket1, socket2):
         databuffer = bytearray(b"")

         REALTIME_RESOLUTION = 0.01
-        sample_width = 2 # pcm here is 16 bit
+        sample_width = 2 # pcm8/16 here is 16 bit
         byte_rate = sample_width * sample_rate * channels
         chunk_size = int(byte_rate * REALTIME_RESOLUTION)

         timer_start = time.time()
-        speech_state = SpeechState.no_speech
-        voice_found, not_voice = 0, 0
-        # path = 'scripts/vad/audio_bytes.txt'
-        # if os.path.exists(path):
-        # os.remove(path)
-        # audio_file = open(path, "a")
         try:
-            sample_width = 1 if codec == "pcm8" else 2
             while websocket_active:
                 data = await websocket.receive_bytes()
                 if codec == 'opus':
                     decoded_opus = decoder.decode(data, frame_size=320)
                     samples = torch.frombuffer(decoded_opus, dtype=torch.int16).float() / 32768.0
-                elif codec in ['pcm8', 'pcm16']: # Both are now 16-bit
-                    samples = torch.frombuffer(data, dtype=torch.int16).float() / 32768.0
+                elif codec in ['pcm8', 'pcm16']: # Both are 16 bit
+                    writable_data = bytearray(data)
+                    samples = torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0
                 else:
                     raise ValueError(f"Unsupported codec: {codec}")

@@ -144,8 +138,6 @@ async def receive_audio(socket1, socket2):
                 else:
                     socket2.send(audio_buffer)

-                audio_buffer = bytearray()
-
         except WebSocketDisconnect:
             print("WebSocket disconnected")
         except Exception as e:
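As context for the transcribe.py changes, here is a minimal sketch (not the repository's actual code) of the two ideas the diff touches: PCM packets are copied into a writable bytearray before torch.frombuffer, and the is_speech_active / speech_timeout / last_speech_time variables keep audio flowing to the transcription socket for a short grace period after the VAD stops reporting speech. The helper names decode_chunk and should_forward, the state dict, and the one-argument is_speech_present callable are hypothetical simplifications; the repository's is_speech_present also takes a vad_iterator and a window size.

import time

import torch


def decode_chunk(data: bytes, codec: str) -> torch.Tensor:
    # Treat pcm8/pcm16 as 16-bit samples and normalize to [-1, 1]; the bytearray
    # copy gives torch.frombuffer a writable buffer, avoiding the read-only warning.
    if codec in ('pcm8', 'pcm16'):
        writable_data = bytearray(data)
        return torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0
    raise ValueError(f"Unsupported codec: {codec}")


def should_forward(samples, state: dict, is_speech_present, speech_timeout: float = 0.7) -> bool:
    # Hypothetical gating helper: forward audio while speech is detected, and keep
    # forwarding for speech_timeout seconds afterwards so trailing words are not cut off.
    now = time.time()
    if is_speech_present(samples):
        state['is_speech_active'] = True
        state['last_speech_time'] = now
        return True
    if state.get('is_speech_active') and now - state.get('last_speech_time', 0) < speech_timeout:
        return True
    state['is_speech_active'] = False
    return False

In the real receive loop the boolean would decide whether the decoded chunk is appended to the audio buffer and forwarded over socket1 or socket2.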
3 changes: 1 addition & 2 deletions backend/utils/stt/vad.py
@@ -19,8 +19,7 @@
 (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils


-def get_speech_state(data, vad_iterator, window_size_samples=256):
-    has_start, has_end = False, False
+def is_speech_present(data, vad_iterator, window_size_samples=256):
     for i in range(0, len(data), window_size_samples):
         chunk = data[i: i + window_size_samples]
         if len(chunk) < window_size_samples:
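The diff truncates the body of is_speech_present in vad.py. One plausible completion, assuming Silero VAD's VADIterator interface (calling the iterator on a window returns a dict when it detects a speech boundary and None otherwise); the actual return condition in the repository may differ:

def is_speech_present(data, vad_iterator, window_size_samples=256):
    # Walk the audio in fixed-size windows and ask the VAD iterator about each one.
    for i in range(0, len(data), window_size_samples):
        chunk = data[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break  # skip the trailing partial window
        speech_dict = vad_iterator(chunk)
        if speech_dict:
            # The iterator reported a speech boundary ('start' or 'end') in this window.
            return True
    return False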
