Merge branch 'dev' into refactor/logging

2025-11-05 15:09:14 +01:00
parent 220c5c7739 4d38850a1d
commit d9fef22090
11 changed files with 153 additions and 51 deletions
--- a/.githooks/check-branch-name.sh
+++ b/.githooks/check-branch-name.sh
@@ -10,7 +10,7 @@
 # An array of allowed commit types
 ALLOWED_TYPES=(feat fix refactor perf style test docs build chore revert)
 # An array of branches to ignore
-IGNORED_BRANCHES=(main dev)
+IGNORED_BRANCHES=(main dev demo)

 # --- Colors for Output ---
 RED='\033[0;31m'
--- a/src/control_backend/agents/bdi/behaviours/receive_llm_resp_behaviour.py
+++ b/src/control_backend/agents/bdi/behaviours/receive_llm_resp_behaviour.py
@@ -1,6 +1,8 @@
 from spade.behaviour import CyclicBehaviour
+from spade.message import Message

 from control_backend.core.config import settings
+from control_backend.schemas.ri_message import SpeechCommand


 class ReceiveLLMResponseBehaviour(CyclicBehaviour):
@@ -16,7 +18,20 @@ class ReceiveLLMResponseBehaviour(CyclicBehaviour):
            case settings.agent_settings.llm_agent_name:
                content = msg.body
                self.agent.logger.info("Received LLM response: %s", content)
-                # Here the BDI can pass the message back as a response
+
+                speech_command = SpeechCommand(data=content)
+
+                message = Message(
+                    to=settings.agent_settings.ri_command_agent_name
+                    + "@"
+                    + settings.agent_settings.host,
+                    sender=self.agent.jid,
+                    body=speech_command.model_dump_json(),
+                )
+
+                self.agent.logger.debug("Sending message: %s", message)
+
+                await self.send(message)
            case _:
                self.agent.logger.debug("Discarding message from %s", sender)
                pass
--- a/src/control_backend/agents/llm/llm.py
+++ b/src/control_backend/agents/llm/llm.py
@@ -1,4 +1,6 @@
-from typing import Any
+import json
+import re
+from collections.abc import AsyncGenerator

 import httpx
 from spade.behaviour import CyclicBehaviour
@@ -45,11 +47,16 @@ class LLMAgent(BaseAgent):

        async def _process_bdi_message(self, message: Message):
            """
-            Forwards user text to the LLM and replies with the generated text.
+            Forwards user text from the BDI to the LLM and replies with the generated text in chunks
+            separated by punctuation.
            """
            user_text = message.body
-            llm_response = await self._query_llm(user_text)
-            await self._reply(llm_response)
+            # Consume the streaming generator and send a reply for every chunk
+            async for chunk in self._query_llm(user_text):
+                await self._reply(chunk)
+            self.agent.logger.debug(
+                "Finished processing BDI message. Response sent in chunks to BDI Core Agent."
+            )

        async def _reply(self, msg: str):
            """
@@ -60,48 +67,89 @@ class LLMAgent(BaseAgent):
                body=msg,
            )
            await self.send(reply)
-            self.agent.logger.info("Reply sent to BDI Core Agent")

-        async def _query_llm(self, prompt: str) -> str:
+        async def _query_llm(self, prompt: str) -> AsyncGenerator[str]:
             """
-            Sends a chat completion request to the local LLM service.
+            Sends a chat completion request to the local LLM service and streams the response by
+            yielding fragments that each end at a punctuation mark.

             :param prompt: Input text prompt to pass to the LLM.
-            :return: LLM-generated content or fallback message.
+            :yield: Fragments of the LLM-generated content, or a fallback message on error.
             """
-            async with httpx.AsyncClient(timeout=120.0) as client:
-                # Example dynamic content for future (optional)
+            instructions = LLMInstructions(
+                "- Be friendly and respectful.\n"
+                "- Make the conversation feel natural and engaging.\n"
+                "- Speak like a pirate.\n"
+                "- When the user asks what you can do, tell them.",
+                "- Try to learn the user's name during conversation.\n"
+                "- Suggest playing a game of asking yes or no questions where you think of a word "
+                "and the user must guess it.",
+            )
+            messages = [
+                {
+                    "role": "developer",
+                    "content": instructions.build_developer_instruction(),
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ]

-                instructions = LLMInstructions()
-                developer_instruction = instructions.build_developer_instruction()
+            try:
+                current_chunk = ""
+                async for token in self._stream_query_llm(messages):
+                    current_chunk += token

-                response = await client.post(
+                    # Emit every complete fragment (delimiter included, for natural flow) and keep
+                    # the unfinished tail buffered until its own delimiter arrives.
+                    pattern = re.compile(r".*?(?:,|;|:|—|–|\.{3}|…|\.|\?|!)\s*", re.DOTALL)
+                    consumed = 0
+                    for m in pattern.finditer(current_chunk):
+                        yield m.group(0)
+                        consumed = m.end()
+                    current_chunk = current_chunk[consumed:]
+
+                # Yield any remaining tail
+                if current_chunk:
+                    yield current_chunk
+            except httpx.HTTPError as err:
+                self.agent.logger.error("HTTP error.", exc_info=err)
+                yield "LLM service unavailable."
+            except Exception as err:
+                self.agent.logger.error("Unexpected error.", exc_info=err)
+                yield "Error processing the request."
+
+        async def _stream_query_llm(self, messages) -> AsyncGenerator[str]:
+            """Raises httpx.HTTPError when the API gives an error."""
+            async with httpx.AsyncClient(timeout=None) as client:
+                async with client.stream(
+                    "POST",
                    settings.llm_settings.local_llm_url,
-                    headers={"Content-Type": "application/json"},
                    json={
                        "model": settings.llm_settings.local_llm_model,
-                        "messages": [
-                            {"role": "developer", "content": developer_instruction},
-                            {"role": "user", "content": prompt},
-                        ],
+                        "messages": messages,
                        "temperature": 0.3,
+                        "stream": True,
                    },
-                )
-
-                try:
+                ) as response:
                    response.raise_for_status()
-                    data: dict[str, Any] = response.json()
-                    return (
-                        data.get("choices", [{}])[0]
-                        .get("message", {})
-                        .get("content", "No response")
-                    )
-                except httpx.HTTPError as err:
-                    self.agent.logger.error("HTTP error: %s", err)
-                    return "LLM service unavailable."
-                except Exception as err:
-                    self.agent.logger.error("Unexpected error: %s", err)
-                    return "Error processing the request."
+
+                    async for line in response.aiter_lines():
+                        if not line or not line.startswith("data: "):
+                            continue
+
+                        data = line[len("data: ") :]
+                        if data.strip() == "[DONE]":
+                            break
+
+                        try:
+                            event = json.loads(data)
+                            delta = event.get("choices", [{}])[0].get("delta", {}).get("content")
+                            if delta:
+                                yield delta
+                        except json.JSONDecodeError:
+                            self.agent.logger.error("Failed to parse LLM response: %s", data)

    async def setup(self):
        """
--- a/src/control_backend/agents/llm/llm_instructions.py
+++ b/src/control_backend/agents/llm/llm_instructions.py
@@ -28,7 +28,9 @@ class LLMInstructions:
        """
        sections = [
            "You are a Pepper robot engaging in natural human conversation.",
-            "Keep responses between 1–5 sentences, unless instructed otherwise.\n",
+            "Keep responses between 1–3 sentences, unless told otherwise.\n",
+            "You're given goals to reach. Reach them in order, but make the conversation feel "
+            "natural. Some turns you should not try to achieve your goals.\n",
        ]

        if self.norms:
--- a/src/control_backend/agents/ri_command_agent.py
+++ b/src/control_backend/agents/ri_command_agent.py
@@ -1,5 +1,6 @@
 import json

+import spade.agent
 import zmq
 from spade.behaviour import CyclicBehaviour
 from zmq.asyncio import Context
@@ -29,6 +30,8 @@ class RICommandAgent(BaseAgent):
        self.bind = bind

    class SendCommandsBehaviour(CyclicBehaviour):
+        """Behaviour for sending commands received from the UI."""
+
        async def run(self):
            """
            Run the command publishing loop indefinetely.
@@ -45,7 +48,19 @@ class RICommandAgent(BaseAgent):
                # Send to the robot.
                await self.agent.pubsocket.send_json(message.model_dump())
            except Exception as e:
-                self.logger.error("Error processing message: %s", e)
+                self.agent.logger.error("Error processing message: %s", e)
+
+    class SendPythonCommandsBehaviour(CyclicBehaviour):
+        """Behaviour for sending commands received from other Python agents."""
+
+        async def run(self):
+            message: spade.agent.Message = await self.receive(timeout=0.1)
+            if message and message.to == self.agent.jid:
+                try:
+                    speech_command = SpeechCommand.model_validate_json(message.body)
+                    await self.agent.pubsocket.send_json(speech_command.model_dump())
+                except Exception as e:
+                    self.agent.logger.error("Error processing message: %s", e)

    async def setup(self):
        """
@@ -70,5 +85,6 @@ class RICommandAgent(BaseAgent):
        # Add behaviour to our agent
        commands_behaviour = self.SendCommandsBehaviour()
        self.add_behaviour(commands_behaviour)
+        self.add_behaviour(self.SendPythonCommandsBehaviour())

        self.logger.info("Finished setting up %s", self.jid)
--- a/src/control_backend/agents/transcription/speech_recognizer.py
+++ b/src/control_backend/agents/transcription/speech_recognizer.py
@@ -36,16 +36,16 @@ class SpeechRecognizer(abc.ABC):
    def _estimate_max_tokens(audio: np.ndarray) -> int:
        """
        Estimate the maximum length of a given audio sample in tokens. Assumes a maximum speaking
-        rate of 300 words per minute (2x average), and assumes that 3 words is 4 tokens.
+        rate of 450 words per minute (3x average), and assumes that 3 words is 4 tokens.

        :param audio: The audio sample (16 kHz) to use for length estimation.
        :return: The estimated length of the transcribed audio in tokens.
        """
        length_seconds = len(audio) / 16_000
        length_minutes = length_seconds / 60
-        word_count = length_minutes * 300
+        word_count = length_minutes * 450
        token_count = word_count / 3 * 4
-        return int(token_count)
+        return int(token_count) + 10

    def _get_decode_options(self, audio: np.ndarray) -> dict:
        """
@@ -85,9 +85,10 @@ class MLXWhisperSpeechRecognizer(SpeechRecognizer):
    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
        return mlx_whisper.transcribe(
-            audio, path_or_hf_repo=self.model_name, decode_options=self._get_decode_options(audio)
-        )["text"]
-        return mlx_whisper.transcribe(audio, path_or_hf_repo=self.model_name)["text"].strip()
+            audio,
+            path_or_hf_repo=self.model_name,
+            **self._get_decode_options(audio),
+        )["text"].strip()


 class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):
@@ -103,6 +104,4 @@ class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):

    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
-        return whisper.transcribe(
-            self.model, audio, decode_options=self._get_decode_options(audio)
-        )["text"]
+        return whisper.transcribe(self.model, audio, **self._get_decode_options(audio))["text"]
--- a/src/control_backend/agents/transcription/transcription_agent.py
+++ b/src/control_backend/agents/transcription/transcription_agent.py
@@ -56,6 +56,10 @@ class TranscriptionAgent(BaseAgent):
            audio = await self.audio_in_socket.recv()
            audio = np.frombuffer(audio, dtype=np.float32)
            speech = await self._transcribe(audio)
+            if not speech:
+                self.agent.logger.info("Nothing transcribed.")
+                return
+
            self.agent.logger.info("Transcribed speech: %s", speech)

            await self._share_transcription(speech)
--- a/src/control_backend/agents/vad_agent.py
+++ b/src/control_backend/agents/vad_agent.py
@@ -51,8 +51,20 @@ class Streaming(CyclicBehaviour):

        self.audio_buffer = np.array([], dtype=np.float32)
        self.i_since_speech = 100  # Used to allow small pauses in speech
+        self._ready = False
+
+    async def reset(self):
+        """Discards audio packets that are already queued, then allows `run` to start."""
+        discarded = 0
+        while await self.audio_in_poller.poll(1) is not None:
+            discarded += 1
+        self.agent.logger.info("Discarded %s audio packets before starting.", discarded)
+        self._ready = True

    async def run(self) -> None:
+        if not self._ready:
+            return
+
        data = await self.audio_in_poller.poll()
        if data is None:
            if len(self.audio_buffer) > 0:
@@ -106,6 +118,8 @@ class VADAgent(BaseAgent):
        self.audio_in_socket: azmq.Socket | None = None
        self.audio_out_socket: azmq.Socket | None = None

+        self.streaming_behaviour: Streaming | None = None
+
    async def stop(self):
        """
        Stop listening to audio, stop publishing audio, close sockets.
@@ -148,8 +162,8 @@ class VADAgent(BaseAgent):
            return
        audio_out_address = f"tcp://localhost:{audio_out_port}"

-        streaming = Streaming(self.audio_in_socket, self.audio_out_socket)
-        self.add_behaviour(streaming)
+        self.streaming_behaviour = Streaming(self.audio_in_socket, self.audio_out_socket)
+        self.add_behaviour(self.streaming_behaviour)

        # Start agents dependent on the output audio fragments here
        transcriber = TranscriptionAgent(audio_out_address)
--- a/test/integration/agents/vad_agent/test_vad_with_audio.py
+++ b/test/integration/agents/vad_agent/test_vad_with_audio.py
@@ -48,6 +48,7 @@ async def test_real_audio(mocker):
    audio_out_socket = AsyncMock()

    vad_streamer = Streaming(audio_in_socket, audio_out_socket)
+    vad_streamer._ready = True
    for _ in audio_chunks:
        await vad_streamer.run()

--- a/test/unit/agents/test_vad_streaming.py
+++ b/test/unit/agents/test_vad_streaming.py
@@ -21,7 +21,9 @@ def streaming(audio_in_socket, audio_out_socket):
    import torch

    torch.hub.load.return_value = (..., ...)  # Mock
-    return Streaming(audio_in_socket, audio_out_socket)
+    streaming = Streaming(audio_in_socket, audio_out_socket)
+    streaming._ready = True
+    return streaming


 async def simulate_streaming_with_probabilities(streaming, probabilities: list[float]):
--- a/test/unit/agents/transcription/test_speech_recognizer.py
+++ b/test/unit/agents/transcription/test_speech_recognizer.py
@@ -5,12 +5,13 @@ from control_backend.agents.transcription.speech_recognizer import OpenAIWhisper


 def test_estimate_max_tokens():
-    """Inputting one minute of audio, assuming 300 words per minute, expecting 400 tokens."""
+    """Inputting one minute of audio, assuming 450 words per minute and adding a 10 token padding,
+    expecting 610 tokens."""
    audio = np.empty(shape=(60 * 16_000), dtype=np.float32)

    actual = SpeechRecognizer._estimate_max_tokens(audio)

-    assert actual == 400
+    assert actual == 610
    assert isinstance(actual, int)