diff --git a/.env.example b/.env.example
index d498054..41a382a 100644
--- a/.env.example
+++ b/.env.example
@@ -10,7 +10,7 @@
 LLM_SETTINGS__LOCAL_LLM_URL="http://localhost:1234/v1/chat/completions"
 LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss"

 # Number of non-speech chunks to wait before speech ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
-BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=3
+BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15
 # Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off.
 BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100
diff --git a/src/control_backend/agents/perception/vad_agent.py b/src/control_backend/agents/perception/vad_agent.py
index 70fa9e1..e47b27a 100644
--- a/src/control_backend/agents/perception/vad_agent.py
+++ b/src/control_backend/agents/perception/vad_agent.py
@@ -229,10 +229,11 @@ class VADAgent(BaseAgent):
             assert self.model is not None
             prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
             non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
+            begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
             prob_threshold = settings.behaviour_settings.vad_prob_threshold

             if prob > prob_threshold:
-                if self.i_since_speech > non_speech_patience:
+                if self.i_since_speech > non_speech_patience + begin_silence_length:
                     self.logger.debug("Speech started.")
                 self.audio_buffer = np.append(self.audio_buffer, chunk)
                 self.i_since_speech = 0
@@ -246,11 +247,12 @@ class VADAgent(BaseAgent):
                 continue

             # Speech probably ended. Make sure we have a usable amount of data.
-            if len(self.audio_buffer) >= 3 * len(chunk):
+            if len(self.audio_buffer) > begin_silence_length * len(chunk):
                 self.logger.debug("Speech ended.")
                 assert self.audio_out_socket is not None
                 await self.audio_out_socket.send(self.audio_buffer[: -2 * len(chunk)].tobytes())

-            # At this point, we know that the speech has ended.
-            # Prepend the last chunk that had no speech, for a more fluent boundary
-            self.audio_buffer = chunk
+            # At this point, we know that there is no speech.
+            # Prepend the last few chunks that had no speech, for a more fluent boundary.
+            self.audio_buffer = np.append(self.audio_buffer, chunk)
+            self.audio_buffer = self.audio_buffer[-begin_silence_length * len(chunk) :]
diff --git a/src/control_backend/core/config.py b/src/control_backend/core/config.py
index 2ed5c04..02018ee 100644
--- a/src/control_backend/core/config.py
+++ b/src/control_backend/core/config.py
@@ -73,6 +73,7 @@ class BehaviourSettings(BaseModel):
     :ivar vad_prob_threshold: Probability threshold for Voice Activity Detection.
     :ivar vad_initial_since_speech: Initial value for 'since speech' counter in VAD.
     :ivar vad_non_speech_patience_chunks: Number of non-speech chunks to wait before speech ended.
+    :ivar vad_begin_silence_chunks: The number of chunks of silence to prepend to speech chunks.
     :ivar transcription_max_concurrent_tasks: Maximum number of concurrent transcription tasks.
     :ivar transcription_words_per_minute: Estimated words per minute for transcription timing.
     :ivar transcription_words_per_token: Estimated words per token for transcription timing.
@@ -90,6 +91,7 @@ class BehaviourSettings(BaseModel):
     vad_prob_threshold: float = 0.5
     vad_initial_since_speech: int = 100
     vad_non_speech_patience_chunks: int = 15
+    vad_begin_silence_chunks: int = 3

     # transcription behaviour
     transcription_max_concurrent_tasks: int = 3
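
For anyone sanity-checking the numbers: below is a minimal sketch of the two behaviours this patch touches, namely how the patience value translates to wall-clock pause length and how the rolling buffer keeps a short run of leading silence for the next utterance. The chunk size and sample rate are illustrative assumptions (Silero-style 512-sample chunks at 16 kHz, roughly 32 ms, close to the ~31 ms quoted in .env.example) and are not taken from the patch itself.

import numpy as np

# Assumed chunking, not from the patch: 512 samples at 16 kHz ~= 32 ms/chunk.
SAMPLE_RATE_HZ = 16_000
CHUNK_LEN = 512
NON_SPEECH_PATIENCE_CHUNKS = 15  # the new .env default
BEGIN_SILENCE_CHUNKS = 3         # the new vad_begin_silence_chunks default

# Patience in wall-clock terms: the pause a speaker may take before the
# utterance is flushed. 15 chunks ~= 480 ms, versus ~96 ms at the old value of 3.
pause_ms = NON_SPEECH_PATIENCE_CHUNKS * CHUNK_LEN / SAMPLE_RATE_HZ * 1000
print(f"patience ~= {pause_ms:.0f} ms")

# The rolling buffer after "Speech ended.": append the current chunk, then keep
# only the trailing BEGIN_SILENCE_CHUNKS chunks so the next utterance starts
# with a little silence instead of a hard cut. Synthetic chunks are filled with
# their index so we can see which ones survive.
buffer = np.concatenate([np.full(CHUNK_LEN, i, dtype=np.float32) for i in range(10)])
chunk = np.full(CHUNK_LEN, 10.0, dtype=np.float32)

buffer = np.append(buffer, chunk)
buffer = buffer[-BEGIN_SILENCE_CHUNKS * len(chunk) :]

assert len(buffer) == BEGIN_SILENCE_CHUNKS * CHUNK_LEN
print(buffer[::CHUNK_LEN])  # [ 8.  9. 10.] -> only the last three chunks remain

The printed slice confirms that the two `+` lines at the end of the vad_agent.py hunk keep exactly the trailing `begin_silence_length` chunks of the buffer.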