Merge branch 'dev' into refactor/logging
@@ -1,6 +1,8 @@
from spade.behaviour import CyclicBehaviour
from spade.message import Message

from control_backend.core.config import settings
from control_backend.schemas.ri_message import SpeechCommand


class ReceiveLLMResponseBehaviour(CyclicBehaviour):
@@ -16,7 +18,20 @@ class ReceiveLLMResponseBehaviour(CyclicBehaviour):
case settings.agent_settings.llm_agent_name:
    content = msg.body
    self.agent.logger.info("Received LLM response: %s", content)
    # Here the BDI can pass the message back as a response

    speech_command = SpeechCommand(data=content)

    message = Message(
        to=settings.agent_settings.ri_command_agent_name
        + "@"
        + settings.agent_settings.host,
        sender=self.agent.jid,
        body=speech_command.model_dump_json(),
    )

    self.agent.logger.debug("Sending message: %s", message)

    await self.send(message)
case _:
    self.agent.logger.debug("Discarding message from %s", sender)
    pass
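For reference, a minimal sketch of the serialization round trip this change relies on; the real SpeechCommand schema lives in control_backend.schemas.ri_message, and any fields beyond data are assumptions here:

from pydantic import BaseModel


class SpeechCommand(BaseModel):  # hypothetical stand-in for illustration only
    data: str


wire = SpeechCommand(data="Ahoy! How can I help ye?").model_dump_json()
# ...sent as the SPADE Message body, then on the RI command agent side:
command = SpeechCommand.model_validate_json(wire)
payload = command.model_dump()  # plain dict, suitable for pubsocket.send_json(payload)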
@@ -1,4 +1,6 @@
from typing import Any
import json
import re
from collections.abc import AsyncGenerator

import httpx
from spade.behaviour import CyclicBehaviour
@@ -45,11 +47,16 @@ class LLMAgent(BaseAgent):

    async def _process_bdi_message(self, message: Message):
        """
        Forwards user text to the LLM and replies with the generated text.
        Forwards user text from the BDI to the LLM and replies with the generated text in chunks
        separated by punctuation.
        """
        user_text = message.body
        llm_response = await self._query_llm(user_text)
        await self._reply(llm_response)
        # Consume the streaming generator and send a reply for every chunk
        async for chunk in self._query_llm(user_text):
            await self._reply(chunk)
        self.agent.logger.debug(
            "Finished processing BDI message. Response sent in chunks to BDI Core Agent."
        )

    async def _reply(self, msg: str):
        """
@@ -60,48 +67,89 @@ class LLMAgent(BaseAgent):
            body=msg,
        )
        await self.send(reply)
        self.agent.logger.info("Reply sent to BDI Core Agent")

    async def _query_llm(self, prompt: str) -> str:
    async def _query_llm(self, prompt: str) -> AsyncGenerator[str]:
        """
        Sends a chat completion request to the local LLM service.
        Sends a chat completion request to the local LLM service and streams the response by
        yielding fragments separated by punctuation.

        :param prompt: Input text prompt to pass to the LLM.
        :return: LLM-generated content or fallback message.
        :yield: Fragments of the LLM-generated content.
        """
        async with httpx.AsyncClient(timeout=120.0) as client:
        # Example dynamic content for future (optional)
        instructions = LLMInstructions(
            "- Be friendly and respectful.\n"
            "- Make the conversation feel natural and engaging.\n"
            "- Speak like a pirate.\n"
            "- When the user asks what you can do, tell them.",
            "- Try to learn the user's name during conversation.\n"
            "- Suggest playing a game of asking yes or no questions where you think of a word "
            "and the user must guess it.",
        )
        messages = [
            {
                "role": "developer",
                "content": instructions.build_developer_instruction(),
            },
            {
                "role": "user",
                "content": prompt,
            },
        ]

        instructions = LLMInstructions()
        developer_instruction = instructions.build_developer_instruction()
        try:
            current_chunk = ""
            async for token in self._stream_query_llm(messages):
                current_chunk += token

                response = await client.post(
                # Stream the message in chunks separated by punctuation.
                # We include the delimiter in the emitted chunk for natural flow.
                pattern = re.compile(r".*?(?:,|;|:|—|–|\.{3}|…|\.|\?|!)\s*", re.DOTALL)
                for m in pattern.finditer(current_chunk):
                    chunk = m.group(0)
                    if chunk:
                        yield current_chunk
                        current_chunk = ""

            # Yield any remaining tail
            if current_chunk:
                yield current_chunk
        except httpx.HTTPError as err:
            self.agent.logger.error("HTTP error.", exc_info=err)
            yield "LLM service unavailable."
        except Exception as err:
            self.agent.logger.error("Unexpected error.", exc_info=err)
            yield "Error processing the request."
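A standalone sketch of the punctuation-based chunking used above, run over a fixed buffer instead of a live token stream; keeping the unterminated tail in the buffer for the next iteration is an assumption about the intended behaviour:

import re

pattern = re.compile(r".*?(?:,|;|:|—|–|\.{3}|…|\.|\?|!)\s*", re.DOTALL)


def split_on_punctuation(buffer: str) -> tuple[list[str], str]:
    """Return complete punctuation-terminated chunks and the unterminated tail."""
    chunks = []
    consumed = 0
    for m in pattern.finditer(buffer):
        chunks.append(m.group(0))  # delimiter is included in the chunk
        consumed = m.end()
    return chunks, buffer[consumed:]


chunks, tail = split_on_punctuation("Ahoy, matey! What be yer name")
# chunks == ["Ahoy, ", "matey! "], tail == "What be yer name"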
    async def _stream_query_llm(self, messages) -> AsyncGenerator[str]:
        """Raises httpx.HTTPError when the API gives an error."""
        async with httpx.AsyncClient(timeout=None) as client:
            async with client.stream(
                "POST",
                settings.llm_settings.local_llm_url,
                headers={"Content-Type": "application/json"},
                json={
                    "model": settings.llm_settings.local_llm_model,
                    "messages": [
                        {"role": "developer", "content": developer_instruction},
                        {"role": "user", "content": prompt},
                    ],
                    "messages": messages,
                    "temperature": 0.3,
                    "stream": True,
                },
            )

            try:
            ) as response:
                response.raise_for_status()
                data: dict[str, Any] = response.json()
                return (
                    data.get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "No response")
                )
            except httpx.HTTPError as err:
                self.agent.logger.error("HTTP error: %s", err)
                return "LLM service unavailable."
            except Exception as err:
                self.agent.logger.error("Unexpected error: %s", err)
                return "Error processing the request."

                async for line in response.aiter_lines():
                    if not line or not line.startswith("data: "):
                        continue

                    data = line[len("data: ") :]
                    if data.strip() == "[DONE]":
                        break

                    try:
                        event = json.loads(data)
                        delta = event.get("choices", [{}])[0].get("delta", {}).get("content")
                        if delta:
                            yield delta
                    except json.JSONDecodeError:
                        self.agent.logger.error("Failed to parse LLM response: %s", data)
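For reference, the parser above expects OpenAI-style server-sent-event lines; a minimal sketch of handling one such line (the exact payload shape depends on the local LLM server):

import json

line = 'data: {"choices": [{"delta": {"content": "Ahoy"}}]}'
payload = line[len("data: "):]
if payload.strip() != "[DONE]":
    event = json.loads(payload)
    delta = event.get("choices", [{}])[0].get("delta", {}).get("content")
    print(delta)  # -> "Ahoy"; a final "data: [DONE]" line terminates the stream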
    async def setup(self):
        """
@@ -28,7 +28,9 @@ class LLMInstructions:
        """
        sections = [
            "You are a Pepper robot engaging in natural human conversation.",
            "Keep responses between 1–5 sentences, unless instructed otherwise.\n",
            "Keep responses between 1–3 sentences, unless told otherwise.\n",
            "You're given goals to reach. Reach them in order, but make the conversation feel "
            "natural. Some turns you should not try to achieve your goals.\n",
        ]

        if self.norms:
@@ -1,5 +1,6 @@
import json

import spade.agent
import zmq
from spade.behaviour import CyclicBehaviour
from zmq.asyncio import Context
@@ -29,6 +30,8 @@ class RICommandAgent(BaseAgent):
        self.bind = bind

    class SendCommandsBehaviour(CyclicBehaviour):
        """Behaviour for sending commands received from the UI."""

        async def run(self):
            """
            Run the command publishing loop indefinitely.
@@ -45,7 +48,19 @@ class RICommandAgent(BaseAgent):
                # Send to the robot.
                await self.agent.pubsocket.send_json(message.model_dump())
            except Exception as e:
                self.logger.error("Error processing message: %s", e)
                self.agent.logger.error("Error processing message: %s", e)

    class SendPythonCommandsBehaviour(CyclicBehaviour):
        """Behaviour for sending commands received from other Python agents."""

        async def run(self):
            message: spade.agent.Message = await self.receive(timeout=0.1)
            if message and message.to == self.agent.jid:
                try:
                    speech_command = SpeechCommand.model_validate_json(message.body)
                    await self.agent.pubsocket.send_json(speech_command.model_dump())
                except Exception as e:
                    self.agent.logger.error("Error processing message: %s", e)

    async def setup(self):
        """
@@ -70,5 +85,6 @@ class RICommandAgent(BaseAgent):
        # Add behaviour to our agent
        commands_behaviour = self.SendCommandsBehaviour()
        self.add_behaviour(commands_behaviour)
        self.add_behaviour(self.SendPythonCommandsBehaviour())

        self.logger.info("Finished setting up %s", self.jid)
@@ -36,16 +36,16 @@ class SpeechRecognizer(abc.ABC):
    def _estimate_max_tokens(audio: np.ndarray) -> int:
        """
        Estimate the maximum length of a given audio sample in tokens. Assumes a maximum speaking
        rate of 300 words per minute (2x average), and assumes that 3 words is 4 tokens.
        rate of 450 words per minute (3x average), and assumes that 3 words is 4 tokens.

        :param audio: The audio sample (16 kHz) to use for length estimation.
        :return: The estimated length of the transcribed audio in tokens.
        """
        length_seconds = len(audio) / 16_000
        length_minutes = length_seconds / 60
        word_count = length_minutes * 300
        word_count = length_minutes * 450
        token_count = word_count / 3 * 4
        return int(token_count)
        return int(token_count) + 10
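A quick worked example of the estimate above, using the new 450 words-per-minute figure:

# 10 seconds of 16 kHz audio -> 160_000 samples
length_seconds = 160_000 / 16_000        # 10.0 s
length_minutes = length_seconds / 60     # ~0.167 min
word_count = length_minutes * 450        # 75 words
token_count = word_count / 3 * 4         # 100 tokens
max_tokens = int(token_count) + 10       # 110, including the +10 safety margin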
    def _get_decode_options(self, audio: np.ndarray) -> dict:
        """
@@ -85,9 +85,10 @@ class MLXWhisperSpeechRecognizer(SpeechRecognizer):
    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
        return mlx_whisper.transcribe(
            audio, path_or_hf_repo=self.model_name, decode_options=self._get_decode_options(audio)
        )["text"]
        return mlx_whisper.transcribe(audio, path_or_hf_repo=self.model_name)["text"].strip()
            audio,
            path_or_hf_repo=self.model_name,
            **self._get_decode_options(audio),
        )["text"].strip()


class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):
@@ -103,6 +104,4 @@ class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):

    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
        return whisper.transcribe(
            self.model, audio, decode_options=self._get_decode_options(audio)
        )["text"]
        return whisper.transcribe(self.model, audio, **self._get_decode_options(audio))["text"]
@@ -56,6 +56,10 @@ class TranscriptionAgent(BaseAgent):
            audio = await self.audio_in_socket.recv()
            audio = np.frombuffer(audio, dtype=np.float32)
            speech = await self._transcribe(audio)
            if not speech:
                self.agent.logger.info("Nothing transcribed.")
                return

            self.agent.logger.info("Transcribed speech: %s", speech)

            await self._share_transcription(speech)
@@ -51,8 +51,20 @@ class Streaming(CyclicBehaviour):

        self.audio_buffer = np.array([], dtype=np.float32)
        self.i_since_speech = 100  # Used to allow small pauses in speech
        self._ready = False

    async def reset(self):
        """Clears the ZeroMQ queue and tells this behaviour to start."""
        discarded = 0
        while await self.audio_in_poller.poll(1) is not None:
            discarded += 1
        self.agent.logger.info(f"Discarded {discarded} audio packets before starting.")
        self._ready = True

    async def run(self) -> None:
        if not self._ready:
            return

        data = await self.audio_in_poller.poll()
        if data is None:
            if len(self.audio_buffer) > 0:
@@ -106,6 +118,8 @@ class VADAgent(BaseAgent):
        self.audio_in_socket: azmq.Socket | None = None
        self.audio_out_socket: azmq.Socket | None = None

        self.streaming_behaviour: Streaming | None = None

    async def stop(self):
        """
        Stop listening to audio, stop publishing audio, close sockets.
@@ -148,8 +162,8 @@ class VADAgent(BaseAgent):
            return
        audio_out_address = f"tcp://localhost:{audio_out_port}"

        streaming = Streaming(self.audio_in_socket, self.audio_out_socket)
        self.add_behaviour(streaming)
        self.streaming_behaviour = Streaming(self.audio_in_socket, self.audio_out_socket)
        self.add_behaviour(self.streaming_behaviour)

        # Start agents dependent on the output audio fragments here
        transcriber = TranscriptionAgent(audio_out_address)