diff --git a/.env.example b/.env.example index 41a382a..a4ae63f 100644 --- a/.env.example +++ b/.env.example @@ -12,8 +12,8 @@ LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss" # Number of non-speech chunks to wait before speech ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time. BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15 -# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off. -BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100 +# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, as is often the case when using Wi-Fi; try 500 ms or more. A symptom of this issue is transcriptions getting cut off. +BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=400 diff --git a/src/control_backend/agents/perception/vad_agent.py b/src/control_backend/agents/perception/vad_agent.py index a6711dd..42fae0f 100644 --- a/src/control_backend/agents/perception/vad_agent.py +++ b/src/control_backend/agents/perception/vad_agent.py @@ -241,12 +241,23 @@ class VADAgent(BaseAgent): self._reset_needed = False assert self.audio_in_poller is not None + + non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks + begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks + prob_threshold = settings.behaviour_settings.vad_prob_threshold + data = await self.audio_in_poller.poll() if data is None: if len(self.audio_buffer) > 0: - self.logger.debug( - "No audio data received. Discarding buffer until new data arrives." - ) + # Failed to receive new audio. Send remaining buffer to be transcribed. 
+ if len(self.audio_buffer) > begin_silence_length * 512: + self.logger.debug("Speech ended.") + assert self.audio_out_socket is not None + await self.audio_out_socket.send(self.audio_buffer[: -2 * 512].tobytes()) + else: + self.logger.debug( + "No audio data received. Discarding buffer until new data arrives." + ) self.audio_buffer = np.array([], dtype=np.float32) self.i_since_speech = settings.behaviour_settings.vad_initial_since_speech continue @@ -255,9 +266,6 @@ class VADAgent(BaseAgent): chunk = np.frombuffer(data, dtype=np.float32).copy() assert self.model is not None prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item() - non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks - begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks - prob_threshold = settings.behaviour_settings.vad_prob_threshold if prob > prob_threshold: if self.i_since_speech > non_speech_patience + begin_silence_length: