feat: visual emotion recognition agent #54
@@ -7,6 +7,7 @@ requires-python = ">=3.13"
dependencies = [
    "agentspeak>=0.2.2",
    "colorlog>=6.10.1",
    "deepface>=0.0.96",
    "fastapi[all]>=0.115.6",
    "mlx-whisper>=0.4.3 ; sys_platform == 'darwin'",
    "numpy>=2.3.3",
@@ -0,0 +1,50 @@
import asyncio
import zmq
import zmq.asyncio as azmq

from control_backend.agents import BaseAgent
from control_backend.agents.perception.visual_emotion_detection_agent.visual_emotion_recognizer import DeepFaceEmotionRecognizer
from control_backend.core.agent_system import InternalMessage
from control_backend.core.config import settings

# START FROM RI?

class VisualEmotionRecognitionAgent(BaseAgent):
    def __init__(self, socket_address: str, socket_bind: bool = False, timeout_ms: int = 1000):
        super().__init__(settings.agent_settings.visual_emotion_recognition_name)
        self.socket_address = socket_address
        self.socket_bind = socket_bind
        self.timeout_ms = timeout_ms

    async def setup(self):
        self.logger.info("Setting up %s.", self.name)

        self.emotion_recognizer = DeepFaceEmotionRecognizer()

        self.video_in_socket = azmq.Context.instance().socket(zmq.SUB)

        if self.socket_bind:
            self.video_in_socket.bind(self.socket_address)
        else:
            self.video_in_socket.connect(self.socket_address)

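        # Socket options: subscribe to every topic, have recv() raise zmq.Again
        # once `timeout_ms` passes with no data, and keep only the most recent
        # message (zmq.CONFLATE) so recognition always runs on the latest frame.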
        self.video_in_socket.setsockopt_string(zmq.SUBSCRIBE, "")
        self.video_in_socket.setsockopt(zmq.RCVTIMEO, self.timeout_ms)
        self.video_in_socket.setsockopt(zmq.CONFLATE, 1)

        self.add_behavior(self.retrieve_frame())

    async def retrieve_frame(self):
        """
        Retrieve a video frame from the input socket and run emotion recognition on it.

        :return: None if no frame is received within the timeout.
        """
        await asyncio.sleep(1)  # Yield control to the event loop
        try:
            frame = await self.video_in_socket.recv()
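            # The payload from recv() is raw bytes. If the publisher sends encoded
            # frames (e.g. JPEG), they would need decoding into a numpy image before
            # recognition. A sketch of that step, assuming numpy and OpenCV (cv2)
            # are available; the actual wire format is not shown in this diff:
            #     buffer = np.frombuffer(frame, dtype=np.uint8)
            #     image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)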
            # detect() returns the dominant emotion label for the received frame.
            detected_emotions = self.emotion_recognizer.detect(frame)
        except zmq.Again:
            self.logger.debug("No video frame received within timeout.")
            return None
@@ -0,0 +1,35 @@
import abc
from deepface import DeepFace
import numpy as np

class VisualEmotionRecognizer(abc.ABC):
    @abc.abstractmethod
    def load_model(self):
        """Load the visual emotion recognition model into memory."""
        pass

    @abc.abstractmethod
    def detect(self, image):
        """Recognize emotion from the given image.

        :param image: The input image for emotion recognition.
        :return: Detected emotion label.
        """
        pass

class DeepFaceEmotionRecognizer(VisualEmotionRecognizer):
    def __init__(self):
        self.load_model()

    def load_model(self):
        # Initialize DeepFace model for emotion recognition
        print("Loading Deepface Emotion Model...")
        dummy_img = np.zeros((224, 224, 3), dtype=np.uint8)
        # analyze() does not accept a preloaded model as an argument, so call it
        # once on a dummy image to load the model into memory.
        DeepFace.analyze(dummy_img, actions=['emotion'], enforce_detection=False)
        print("Deepface Emotion Model loaded.")

    def detect(self, image):
        # DeepFace.analyze returns a list with one result dictionary per detected face.
        analysis = DeepFace.analyze(image, actions=['emotion'], enforce_detection=False)
        return analysis[0]['dominant_emotion']
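# A minimal usage sketch for the recognizer, assuming an OpenCV-loaded BGR image;
# the file name and the use of cv2 are illustrative assumptions:
#
#     import cv2
#     recognizer = DeepFaceEmotionRecognizer()
#     image = cv2.imread("face.jpg")
#     print(recognizer.detect(image))  # e.g. "happy"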
@@ -52,6 +52,7 @@ class AgentSettings(BaseModel):
    bdi_core_name: str = "bdi_core_agent"
    bdi_belief_collector_name: str = "belief_collector_agent"
    bdi_program_manager_name: str = "bdi_program_manager_agent"
    visual_emotion_recognition_name: str = "visual_emotion_recognition_agent"
    text_belief_extractor_name: str = "text_belief_extractor_agent"
    vad_name: str = "vad_agent"
    llm_name: str = "llm_agent"