fix: transcribe remaining buffer when network drops

2026-02-09 15:59:48 +01:00
parent 4855bde1a4
commit df489f4ac3
2 changed files with 16 additions and 8 deletions
--- a/.env.example
+++ b/.env.example
@@ -12,8 +12,8 @@ LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss"
 # Number of non-speech chunks to wait before speech ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
 BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15

-# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off.
-BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100
+# Timeout in milliseconds for socket polling. Increase this value (e.g. to 500 ms or more) if network latency/jitter is high, which is often the case on Wi-Fi. A symptom of a timeout that is too low is transcriptions getting cut off.
+BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=400



--- a/src/control_backend/agents/perception/vad_agent.py
+++ b/src/control_backend/agents/perception/vad_agent.py
@@ -241,9 +241,20 @@ class VADAgent(BaseAgent):
                self._reset_needed = False

            assert self.audio_in_poller is not None
+
+            non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
+            begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
+            prob_threshold = settings.behaviour_settings.vad_prob_threshold
+
            data = await self.audio_in_poller.poll()
            if data is None:
                if len(self.audio_buffer) > 0:
+                    # No new audio arrived. Send the remaining buffer for transcription if it is long enough; otherwise discard it.
+                    if len(self.audio_buffer) > begin_silence_length * 512:
+                        self.logger.debug("Speech ended.")
+                        assert self.audio_out_socket is not None
+                        await self.audio_out_socket.send(self.audio_buffer[: -2 * 512].tobytes())
+                    else:
                        self.logger.debug(
                            "No audio data received. Discarding buffer until new data arrives."
                        )
@@ -255,9 +266,6 @@ class VADAgent(BaseAgent):
            chunk = np.frombuffer(data, dtype=np.float32).copy()
            assert self.model is not None
            prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
-            non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
-            begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
-            prob_threshold = settings.behaviour_settings.vad_prob_threshold

            if prob > prob_threshold:
                if self.i_since_speech > non_speech_patience + begin_silence_length: