chore: re-add more silence before speech audio
This commit is contained in:
@@ -285,9 +285,10 @@ class VADAgent(BaseAgent):
|
|||||||
assert self.audio_out_socket is not None
|
assert self.audio_out_socket is not None
|
||||||
await self.audio_out_socket.send(self.audio_buffer[: -2 * len(chunk)].tobytes())
|
await self.audio_out_socket.send(self.audio_buffer[: -2 * len(chunk)].tobytes())
|
||||||
|
|
||||||
# At this point, we know that the speech has ended.
|
# At this point, we know that there is no speech.
|
||||||
# Prepend the last chunk that had no speech, for a more fluent boundary
|
# Prepend the last few chunks that had no speech, for a more fluent boundary.
|
||||||
self.audio_buffer = chunk
|
self.audio_buffer = np.append(self.audio_buffer, chunk)
|
||||||
|
self.audio_buffer = self.audio_buffer[-begin_silence_length * len(chunk) :]
|
||||||
|
|
||||||
async def handle_message(self, msg: InternalMessage):
|
async def handle_message(self, msg: InternalMessage):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -24,7 +24,9 @@ def audio_out_socket():
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def vad_agent(audio_out_socket):
|
def vad_agent(audio_out_socket):
|
||||||
return VADAgent("tcp://localhost:5555", False)
|
agent = VADAgent("tcp://localhost:5555", False)
|
||||||
|
agent._internal_pub_socket = AsyncMock()
|
||||||
|
return agent
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -44,6 +46,12 @@ def patch_settings(monkeypatch):
|
|||||||
monkeypatch.setattr(vad_agent.settings.vad_settings, "sample_rate_hz", 16_000, raising=False)
|
monkeypatch.setattr(vad_agent.settings.vad_settings, "sample_rate_hz", 16_000, raising=False)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def mock_experiment_logger():
|
||||||
|
with patch("control_backend.agents.perception.vad_agent.experiment_logger") as logger:
|
||||||
|
yield logger
|
||||||
|
|
||||||
|
|
||||||
async def simulate_streaming_with_probabilities(streaming, probabilities: list[float]):
|
async def simulate_streaming_with_probabilities(streaming, probabilities: list[float]):
|
||||||
"""
|
"""
|
||||||
Simulates a streaming scenario with given VAD model probabilities for testing purposes.
|
Simulates a streaming scenario with given VAD model probabilities for testing purposes.
|
||||||
@@ -84,14 +92,15 @@ async def test_voice_activity_detected(audio_out_socket, vad_agent):
|
|||||||
Test a scenario where there is voice activity detected between silences.
|
Test a scenario where there is voice activity detected between silences.
|
||||||
"""
|
"""
|
||||||
speech_chunk_count = 5
|
speech_chunk_count = 5
|
||||||
probabilities = [0.0] * 5 + [1.0] * speech_chunk_count + [0.0] * 5
|
begin_silence_chunks = settings.behaviour_settings.vad_begin_silence_chunks
|
||||||
|
probabilities = [0.0] * 15 + [1.0] * speech_chunk_count + [0.0] * 5
|
||||||
vad_agent.audio_out_socket = audio_out_socket
|
vad_agent.audio_out_socket = audio_out_socket
|
||||||
await simulate_streaming_with_probabilities(vad_agent, probabilities)
|
await simulate_streaming_with_probabilities(vad_agent, probabilities)
|
||||||
|
|
||||||
audio_out_socket.send.assert_called_once()
|
audio_out_socket.send.assert_called_once()
|
||||||
data = audio_out_socket.send.call_args[0][0]
|
data = audio_out_socket.send.call_args[0][0]
|
||||||
assert isinstance(data, bytes)
|
assert isinstance(data, bytes)
|
||||||
assert len(data) == 512 * 4 * (speech_chunk_count + 1)
|
assert len(data) == 512 * 4 * (begin_silence_chunks + speech_chunk_count)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@@ -101,8 +110,9 @@ async def test_voice_activity_short_pause(audio_out_socket, vad_agent):
|
|||||||
short pause.
|
short pause.
|
||||||
"""
|
"""
|
||||||
speech_chunk_count = 5
|
speech_chunk_count = 5
|
||||||
|
begin_silence_chunks = settings.behaviour_settings.vad_begin_silence_chunks
|
||||||
probabilities = (
|
probabilities = (
|
||||||
[0.0] * 5 + [1.0] * speech_chunk_count + [0.0] + [1.0] * speech_chunk_count + [0.0] * 5
|
[0.0] * 15 + [1.0] * speech_chunk_count + [0.0] + [1.0] * speech_chunk_count + [0.0] * 5
|
||||||
)
|
)
|
||||||
vad_agent.audio_out_socket = audio_out_socket
|
vad_agent.audio_out_socket = audio_out_socket
|
||||||
await simulate_streaming_with_probabilities(vad_agent, probabilities)
|
await simulate_streaming_with_probabilities(vad_agent, probabilities)
|
||||||
@@ -110,8 +120,8 @@ async def test_voice_activity_short_pause(audio_out_socket, vad_agent):
|
|||||||
audio_out_socket.send.assert_called_once()
|
audio_out_socket.send.assert_called_once()
|
||||||
data = audio_out_socket.send.call_args[0][0]
|
data = audio_out_socket.send.call_args[0][0]
|
||||||
assert isinstance(data, bytes)
|
assert isinstance(data, bytes)
|
||||||
# Expecting 13 chunks (2*5 with speech, 1 pause between, 1 as padding)
|
# Expecting 11 + begin_silence_chunks chunks (2*5 with speech, 1 pause between, begin_silence_chunks as padding)
|
||||||
assert len(data) == 512 * 4 * (speech_chunk_count * 2 + 1 + 1)
|
assert len(data) == 512 * 4 * (speech_chunk_count * 2 + 1 + begin_silence_chunks)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
|||||||
Reference in New Issue
Block a user