fix: transcribe remaining buffer when network drops

2026-02-09 15:59:48 +01:00
parent 4855bde1a4
commit df489f4ac3
2 changed files with 16 additions and 8 deletions
--- a/.env.example
+++ b/.env.example
@@ -12,8 +12,8 @@ LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss"
 # Number of non-speech chunks to wait before speech ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
 BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15
-# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off.
+# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, which is often the case when using Wi-Fi; 500 ms or more may be needed. A symptom of a timeout that is too low is transcriptions getting cut off.
-BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100
+BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=400
--- a/src/control_backend/agents/perception/vad_agent.py
+++ b/src/control_backend/agents/perception/vad_agent.py
@@ -241,12 +241,23 @@ class VADAgent(BaseAgent):
                self._reset_needed = False
            assert self.audio_in_poller is not None
            non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
            begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
            prob_threshold = settings.behaviour_settings.vad_prob_threshold
            data = await self.audio_in_poller.poll()
            if data is None:
                if len(self.audio_buffer) > 0:
-                    self.logger.debug(
+                    # Failed to receive new audio. Send remaining buffer to be transcribed.
-                        "No audio data received. Discarding buffer until new data arrives."
+                    if len(self.audio_buffer) > begin_silence_length * 512:
-                    )
+                        self.logger.debug("Speech ended.")
                        assert self.audio_out_socket is not None
                        await self.audio_out_socket.send(self.audio_buffer[: -2 * 512].tobytes())
                    else:
                        self.logger.debug(
                            "No audio data received. Discarding buffer until new data arrives."
                        )
                    self.audio_buffer = np.array([], dtype=np.float32)
                    self.i_since_speech = settings.behaviour_settings.vad_initial_since_speech
                continue
@@ -255,9 +266,6 @@ class VADAgent(BaseAgent):
            chunk = np.frombuffer(data, dtype=np.float32).copy()
            assert self.model is not None
            prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
            non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
            begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
            prob_threshold = settings.behaviour_settings.vad_prob_threshold
            if prob > prob_threshold:
                if self.i_since_speech > non_speech_patience + begin_silence_length: