
Commit

fix some function to match working one
0xzre committed Aug 17, 2024
1 parent 91c8d5d commit 70d1767
Showing 2 changed files with 7 additions and 16 deletions.
20 changes: 6 additions & 14 deletions backend/routers/transcribe.py
@@ -13,7 +13,7 @@

 from utils.redis_utils import get_user_speech_profile, get_user_speech_profile_duration
 from utils.stt.deepgram_util import process_audio_dg, send_initial_file2, transcribe_file_deepgram
-from utils.stt.vad import VADIterator, model, get_speech_state, SpeechState, vad_is_empty, is_speech_present
+from utils.stt.vad import VADIterator, model, vad_is_empty, is_speech_present

 router = APIRouter()

@@ -54,7 +54,7 @@ async def _websocket_util(
     websocket_active = True
     duration = 0
     is_speech_active = False
-    speech_timeout = 0.7 # Good for now but better dynamically adjust it by user behaviour
+    speech_timeout = 0.7 # Good for now but better dynamically adjust it by user behaviour, just idea: Increase as active time passes but until certain threshold, but not needed yet.
     last_speech_time = 0
     try:
         if language == 'en' and codec == 'opus' and include_speech_profile:
@@ -89,26 +89,20 @@ async def receive_audio(socket1, socket2):
         databuffer = bytearray(b"")

         REALTIME_RESOLUTION = 0.01
-        sample_width = 2 # pcm here is 16 bit
+        sample_width = 2 # pcm8/16 here is 16 bit
         byte_rate = sample_width * sample_rate * channels
         chunk_size = int(byte_rate * REALTIME_RESOLUTION)

         timer_start = time.time()
-        speech_state = SpeechState.no_speech
-        voice_found, not_voice = 0, 0
-        # path = 'scripts/vad/audio_bytes.txt'
-        # if os.path.exists(path):
-        # os.remove(path)
-        # audio_file = open(path, "a")
         try:
-            sample_width = 1 if codec == "pcm8" else 2
             while websocket_active:
                 data = await websocket.receive_bytes()
                 if codec == 'opus':
                     decoded_opus = decoder.decode(data, frame_size=320)
                     samples = torch.frombuffer(decoded_opus, dtype=torch.int16).float() / 32768.0
-                elif codec in ['pcm8', 'pcm16']: # Both are now 16-bit
-                    samples = torch.frombuffer(data, dtype=torch.int16).float() / 32768.0
+                elif codec in ['pcm8', 'pcm16']: # Both are 16 bit
+                    writable_data = bytearray(data)
+                    samples = torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0
                 else:
                     raise ValueError(f"Unsupported codec: {codec}")

@@ -144,8 +138,6 @@ async def receive_audio(socket1, socket2):
                 else:
                     socket2.send(audio_buffer)

-                audio_buffer = bytearray()
-
         except WebSocketDisconnect:
             print("WebSocket disconnected")
         except Exception as e:
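As context for the transcribe.py changes, here is a minimal sketch (not the repository's actual code) of the two ideas the diff touches: PCM packets are copied into a writable bytearray before torch.frombuffer, and the is_speech_active / speech_timeout / last_speech_time variables keep audio flowing to the transcription socket for a short grace period after the VAD stops reporting speech. The helper names decode_chunk and should_forward, the state dict, and the one-argument is_speech_present callable are hypothetical simplifications; the repository's is_speech_present also takes a vad_iterator and a window size.

import time

import torch


def decode_chunk(data: bytes, codec: str) -> torch.Tensor:
    # Treat pcm8/pcm16 as 16-bit samples and normalize to [-1, 1]; the bytearray
    # copy gives torch.frombuffer a writable buffer, avoiding the read-only warning.
    if codec in ('pcm8', 'pcm16'):
        writable_data = bytearray(data)
        return torch.frombuffer(writable_data, dtype=torch.int16).float() / 32768.0
    raise ValueError(f"Unsupported codec: {codec}")


def should_forward(samples, state: dict, is_speech_present, speech_timeout: float = 0.7) -> bool:
    # Hypothetical gating helper: forward audio while speech is detected, and keep
    # forwarding for speech_timeout seconds afterwards so trailing words are not cut off.
    now = time.time()
    if is_speech_present(samples):
        state['is_speech_active'] = True
        state['last_speech_time'] = now
        return True
    if state.get('is_speech_active') and now - state.get('last_speech_time', 0) < speech_timeout:
        return True
    state['is_speech_active'] = False
    return False

In the real receive loop the boolean would decide whether the decoded chunk is appended to the audio buffer and forwarded over socket1 or socket2.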
3 changes: 1 addition & 2 deletions backend/utils/stt/vad.py
@@ -19,8 +19,7 @@
 (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils


-def get_speech_state(data, vad_iterator, window_size_samples=256):
-    has_start, has_end = False, False
+def is_speech_present(data, vad_iterator, window_size_samples=256):
     for i in range(0, len(data), window_size_samples):
         chunk = data[i: i + window_size_samples]
         if len(chunk) < window_size_samples:
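The diff truncates the body of is_speech_present in vad.py. One plausible completion, assuming Silero VAD's VADIterator interface (calling the iterator on a window returns a dict when it detects a speech boundary and None otherwise); the actual return condition in the repository may differ:

def is_speech_present(data, vad_iterator, window_size_samples=256):
    # Walk the audio in fixed-size windows and ask the VAD iterator about each one.
    for i in range(0, len(data), window_size_samples):
        chunk = data[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break  # skip the trailing partial window
        speech_dict = vad_iterator(chunk)
        if speech_dict:
            # The iterator reported a speech boundary ('start' or 'end') in this window.
            return True
    return False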
