fix: move VAD agent creation to RI communication agent

Previously, it was started in main, but it should use values negotiated by the RI communication agent. ref: N25B-356
2025-12-03 15:07:29 +01:00
parent c85753f834
commit 21e9d05d6e
6 changed files with 125 additions and 27 deletions
--- a/src/control_backend/agents/perception/vad_agent.py
+++ b/src/control_backend/agents/perception/vad_agent.py
@@ -8,6 +8,7 @@ import zmq.asyncio as azmq
 from control_backend.agents import BaseAgent
 from control_backend.core.config import settings

+from ...schemas.program_status import PROGRAM_STATUS, ProgramStatus
 from .transcription_agent.transcription_agent import TranscriptionAgent


@@ -61,6 +62,7 @@ class VADAgent(BaseAgent):
    :ivar audio_in_address: Address of the input audio stream.
    :ivar audio_in_bind: Whether to bind or connect to the input address.
    :ivar audio_out_socket: ZMQ PUB socket for sending speech fragments.
+    :ivar program_sub_socket: ZMQ SUB socket for receiving program status updates.
    """

    def __init__(self, audio_in_address: str, audio_in_bind: bool):
@@ -79,6 +81,8 @@ class VADAgent(BaseAgent):
        self.audio_out_socket: azmq.Socket | None = None
        self.audio_in_poller: SocketPoller | None = None

+        self.program_sub_socket: azmq.Socket | None = None
+
        self.audio_buffer = np.array([], dtype=np.float32)
        self.i_since_speech = settings.behaviour_settings.vad_initial_since_speech
        self._ready = asyncio.Event()
@@ -90,9 +94,10 @@ class VADAgent(BaseAgent):

        1. Connects audio input socket.
        2. Binds audio output socket (random port).
-        3. Loads VAD model from Torch Hub.
-        4. Starts the streaming loop.
-        5. Instantiates and starts the :class:`TranscriptionAgent` with the output address.
+        3. Connects to program communication socket.
+        4. Loads VAD model from Torch Hub.
+        5. Starts the streaming loop.
+        6. Instantiates and starts the :class:`TranscriptionAgent` with the output address.
        """
        self.logger.info("Setting up %s", self.name)

@@ -105,6 +110,11 @@ class VADAgent(BaseAgent):
            return
        audio_out_address = f"tcp://localhost:{audio_out_port}"

+        # Connect to internal communication socket
+        self.program_sub_socket = azmq.Context.instance().socket(zmq.SUB)
+        self.program_sub_socket.connect(settings.zmq_settings.internal_sub_address)
+        self.program_sub_socket.subscribe(PROGRAM_STATUS)
+
        # Initialize VAD model
        try:
            self.model, _ = torch.hub.load(
@@ -117,10 +127,8 @@ class VADAgent(BaseAgent):
            await self.stop()
            return

-        # Warmup/reset
-        await self.reset_stream()
-
        self.add_behavior(self._streaming_loop())
+        self.add_behavior(self._status_loop())

        # Start agents dependent on the output audio fragments here
        transcriber = TranscriptionAgent(audio_out_address)
@@ -165,7 +173,7 @@ class VADAgent(BaseAgent):
            self.audio_out_socket = None
            return None

-    async def reset_stream(self):
+    async def _reset_stream(self):
        """
        Clears the ZeroMQ queue and sets ready state.
        """
@@ -176,6 +184,23 @@ class VADAgent(BaseAgent):
        self.logger.info(f"Discarded {discarded} audio packets before starting.")
        self._ready.set()

+    async def _status_loop(self):
+        """Loop for checking program status. Only start listening if program is RUNNING."""
+        while self._running:
+            topic, body = await self.program_sub_socket.recv_multipart()
+            # NOTE(review): pyzmq yields bytes frames by default — confirm both constants are bytes.
+            if topic != PROGRAM_STATUS:
+                continue
+            if body != ProgramStatus.RUNNING.value:
+                continue
+
+            # Program is RUNNING: _reset_stream() discards queued audio and sets the ready flag.
+            await self._reset_stream()
+
+            # One-shot: later status updates are not needed, so close the socket and end the loop.
+            self.program_sub_socket.close()
+            break
+
    async def _streaming_loop(self):
        """
        Main loop for processing audio stream.