From 941aa00b7be48dfdba8a44bff7ba6c362de461ef Mon Sep 17 00:00:00 2001 From: Twirre Meulenbelt <43213592+TwirreM@users.noreply.github.com> Date: Tue, 27 Jan 2026 18:19:20 +0100 Subject: [PATCH] chore: re-add more silence before speech audio --- .../agents/perception/vad_agent.py | 7 +++--- .../vad_agent/test_vad_streaming.py | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/control_backend/agents/perception/vad_agent.py b/src/control_backend/agents/perception/vad_agent.py index 920c3ab..f397563 100644 --- a/src/control_backend/agents/perception/vad_agent.py +++ b/src/control_backend/agents/perception/vad_agent.py @@ -285,9 +285,10 @@ class VADAgent(BaseAgent): assert self.audio_out_socket is not None await self.audio_out_socket.send(self.audio_buffer[: -2 * len(chunk)].tobytes()) - # At this point, we know that the speech has ended. - # Prepend the last chunk that had no speech, for a more fluent boundary - self.audio_buffer = chunk + # At this point, we know that there is no speech. + # Prepend the last few chunks that had no speech, for a more fluent boundary.
+ self.audio_buffer = np.append(self.audio_buffer, chunk) + self.audio_buffer = self.audio_buffer[-begin_silence_length * len(chunk) :] async def handle_message(self, msg: InternalMessage): """ diff --git a/test/unit/agents/perception/vad_agent/test_vad_streaming.py b/test/unit/agents/perception/vad_agent/test_vad_streaming.py index 349fab2..b53f63d 100644 --- a/test/unit/agents/perception/vad_agent/test_vad_streaming.py +++ b/test/unit/agents/perception/vad_agent/test_vad_streaming.py @@ -24,7 +24,9 @@ def audio_out_socket(): @pytest.fixture def vad_agent(audio_out_socket): - return VADAgent("tcp://localhost:5555", False) + agent = VADAgent("tcp://localhost:5555", False) + agent._internal_pub_socket = AsyncMock() + return agent @pytest.fixture(autouse=True) @@ -44,6 +46,12 @@ def patch_settings(monkeypatch): monkeypatch.setattr(vad_agent.settings.vad_settings, "sample_rate_hz", 16_000, raising=False) +@pytest.fixture(autouse=True) +def mock_experiment_logger(): + with patch("control_backend.agents.perception.vad_agent.experiment_logger") as logger: + yield logger + + async def simulate_streaming_with_probabilities(streaming, probabilities: list[float]): """ Simulates a streaming scenario with given VAD model probabilities for testing purposes. @@ -84,14 +92,15 @@ async def test_voice_activity_detected(audio_out_socket, vad_agent): Test a scenario where there is voice activity detected between silences. 
""" speech_chunk_count = 5 - probabilities = [0.0] * 5 + [1.0] * speech_chunk_count + [0.0] * 5 + begin_silence_chunks = settings.behaviour_settings.vad_begin_silence_chunks + probabilities = [0.0] * 15 + [1.0] * speech_chunk_count + [0.0] * 5 vad_agent.audio_out_socket = audio_out_socket await simulate_streaming_with_probabilities(vad_agent, probabilities) audio_out_socket.send.assert_called_once() data = audio_out_socket.send.call_args[0][0] assert isinstance(data, bytes) - assert len(data) == 512 * 4 * (speech_chunk_count + 1) + assert len(data) == 512 * 4 * (begin_silence_chunks + speech_chunk_count) @pytest.mark.asyncio @@ -101,8 +110,9 @@ async def test_voice_activity_short_pause(audio_out_socket, vad_agent): short pause. """ speech_chunk_count = 5 + begin_silence_chunks = settings.behaviour_settings.vad_begin_silence_chunks probabilities = ( - [0.0] * 5 + [1.0] * speech_chunk_count + [0.0] + [1.0] * speech_chunk_count + [0.0] * 5 + [0.0] * 15 + [1.0] * speech_chunk_count + [0.0] + [1.0] * speech_chunk_count + [0.0] * 5 ) vad_agent.audio_out_socket = audio_out_socket await simulate_streaming_with_probabilities(vad_agent, probabilities) @@ -110,8 +120,8 @@ async def test_voice_activity_short_pause(audio_out_socket, vad_agent): audio_out_socket.send.assert_called_once() data = audio_out_socket.send.call_args[0][0] assert isinstance(data, bytes) - # Expecting 13 chunks (2*5 with speech, 1 pause between, 1 as padding) - assert len(data) == 512 * 4 * (speech_chunk_count * 2 + 1 + 1) + # Expecting 2*5 chunks with speech, 1 pause between, begin_silence_chunks as padding + assert len(data) == 512 * 4 * (speech_chunk_count * 2 + 1 + begin_silence_chunks) @pytest.mark.asyncio