6 Commits

Author SHA1 Message Date
b4814d431f feat: ignore own speech
When we detect that Pepper is talking, we stop sending audio.

ref: N25B-214
2025-10-27 13:08:57 +01:00
Twirre Meulenbelt
230ab5d5cc test: add case for microphone failure
When the microphone fails, it raises an IOError during `read`. A new test simulates this.

ref: N25B-119
2025-10-22 15:38:30 +02:00
Twirre Meulenbelt
0499cd8a24 feat: send audio
AudioSender runs in a separate thread to send audio from the microphone.

ref: N25B-119
2025-10-22 15:10:27 +02:00
Twirre Meulenbelt
f8db719bfa test: unit test mock PyAudio, integration test use real
Make unit tests use a mock version of PyAudio, while integration tests use the real version. If no real microphone is available, these integration tests are skipped.

ref: N25B-119
2025-10-22 13:27:35 +02:00
Twirre Meulenbelt
1e3e077029 fix: disallow selecting non-microphone audio device
Previously, any audio device could be selected as the microphone. Now only devices with at least one input channel can be selected.

ref: N25B-119
2025-10-22 13:24:46 +02:00
Twirre Meulenbelt
0f60f67ab9 feat: add microphone selection utils
Provides two functions: one to choose the default microphone, the other to choose a microphone interactively. Tests included.

ref: N25B-119
2025-10-22 11:44:51 +02:00
15 changed files with 482 additions and 96 deletions

View File

@@ -1,6 +1,4 @@
## Development environment
### Linux (or WSL)
Start off by installing [Pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) and walk through the steps outlined there (be sure to also add it to PATH). Also install the [Python build requirements](https://github.com/pyenv/pyenv/wiki#suggested-build-environment). Afterwards, install Python 2.7 and activate it for your current shell:
# PepperPlus-RI
The robot interface is a high-level API for controlling the robot. It implements the API as designed: https://utrechtuniversity.youtrack.cloud/articles/N25B-A-14/RI-CB-Communication.

Binary file not shown.

View File

@@ -1,93 +0,0 @@
import threading
import pyaudio
import zmq
from state import state
def choose_mic_interactive(audio):
"""Choose a microphone to use. The `audio` parameter is an instance of PyAudio. Returns a dict."""
device_count = audio.get_device_count()
print("Found {} audio devices:".format(device_count))
for i in range(device_count):
print("- {}: {}".format(i, audio.get_device_info_by_index(i)["name"]))
microphone_index = None
while microphone_index is None:
chosen = input("Which device would you like to use?\n> ")
try:
chosen = int(chosen)
if chosen < 0 or chosen > device_count: raise ValueError()
microphone_index = chosen
except ValueError:
print("Please enter a number between 0 and {}".format(device_count))
chosen_microphone = audio.get_device_info_by_index(microphone_index)
print("Chose microphone \"{}\"".format(chosen_microphone["name"]))
return chosen_microphone
def choose_mic_default(audio):
"""Choose a microphone to use based on defaults. The `audio` parameter is a PyAudio. Returns a dict."""
default_device = audio.get_default_input_device_info()
return default_device
class AudioStreaming:
def __init__(self, port=5557):
self.port = port
self.audio = pyaudio.PyAudio()
self.microphone = choose_mic_default(self.audio)
self.thread = None
def run(self):
self.thread = threading.Thread(target=self._stream)
self.thread.start()
def wait_until_done(self):
if not self.thread: return
self.thread.join()
def _stream(self):
context = zmq.Context()
socket = context.socket(zmq.PUB)
socket.bind("tcp://*:{}".format(self.port))
chunk = 512 # 320 at 16000 Hz is 20ms, 512 is required for Silero-VAD
stream = self.audio.open(
format=pyaudio.paFloat32,
channels=1,
rate=16000,
input=True,
input_device_index=self.microphone["index"],
frames_per_buffer=chunk,
)
try:
while not state.exit_event.is_set():
data = stream.read(chunk)
socket.send(data)
finally:
stream.stop_stream()
stream.close()
if __name__ == "__main__":
state.initialize()
try:
audio = AudioStreaming()
print("Starting audio streaming...")
audio.run()
import time
end = time.time() + 10
while not state.exit_event.is_set() and time.time() < end:
print "\rExiting in {:.2f} seconds".format(end - time.time()),
time.sleep(0.05)
state.exit_event.set()
audio.wait_until_done()
finally:
state.deinitialize()

Binary file not shown.

View File

@@ -21,6 +21,7 @@ class ActuationReceiver(ReceiverBase):
self.create_socket(zmq_context, zmq.SUB, port)
self.socket.setsockopt_string(zmq.SUBSCRIBE, u"") # Causes block if given in options
self._tts_service = None
self._al_memory = None
def _handle_speech(self, message):
text = message.get("data")
@@ -40,10 +41,26 @@ class ActuationReceiver(ReceiverBase):
if not self._tts_service:
self._tts_service = state.qi_session.service("ALTextToSpeech")
if not self._al_memory:
self._al_memory = state.qi_session.service("ALMemory")
# Subscribe to speech end event
self.status_subscriber = self._al_memory.subscriber("ALTextToSpeech/Status") # Keep the subscriber on self so it is not garbage collected
self.status_subscriber.signal.connect(self._on_status_changed)
# Returns instantly. Messages received while speaking will be queued.
qi.async(self._tts_service.say, text)
@staticmethod
def _on_status_changed(value): # value will contain either 'enqueued', 'started' or 'done' depending on the status
"""Callback function for when the speaking status changes. Will change the is_speaking value of the state."""
if "started" in value:
logging.debug("Started speaking.")
state.is_speaking = True
if "done" in value:
logging.debug("Done speaking.")
state.is_speaking = False
def handle_message(self, message):
if message["endpoint"] == "actuate/speech":
self._handle_speech(message)
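
Note: the hunk only shows the callback wiring. Below is a minimal sketch of how the handler behaves; the list-shaped [task_id, status] payload and the robot_interface.endpoints.actuation_receiver import path are assumptions for illustration, not taken from this diff.

# Sketch only: exercises _on_status_changed directly. The payload shape and
# the import path are assumptions, not confirmed by the hunk above.
from robot_interface.endpoints.actuation_receiver import ActuationReceiver
from robot_interface.state import state

ActuationReceiver._on_status_changed(["task-1", "started"])
assert state.is_speaking        # AudioSender will now stop reading the microphone

ActuationReceiver._on_status_changed(["task-1", "done"])
assert not state.is_speaking    # Audio streaming resumes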

View File

@@ -0,0 +1,73 @@
from __future__ import unicode_literals # So that `logging` can use Unicode characters in names
import threading
import logging
import pyaudio
import zmq
from robot_interface.endpoints.socket_base import SocketBase
from robot_interface.state import state
from robot_interface.utils.microphone import choose_mic_default
logger = logging.getLogger(__name__)
class AudioSender(SocketBase):
def __init__(self, zmq_context, port=5558):
super(AudioSender, self).__init__(str("audio")) # Convert future's unicode_literal to str
self.create_socket(zmq_context, zmq.PUB, port)
self.audio = pyaudio.PyAudio()
self.microphone = choose_mic_default(self.audio)
self.thread = None
def start(self):
"""
Start sending audio in a different thread.
"""
if not self.microphone:
logger.info("Not listening: no microphone available.")
return
logger.info("Listening with microphone \"{}\".".format(self.microphone["name"]))
self.thread = threading.Thread(target=self._stream)
self.thread.start()
def wait_until_done(self):
"""
Wait until the audio thread is done. Will only be done if `state.exit_event` is set, so
make sure to set that before calling this method or it will block.
"""
if not self.thread: return
self.thread.join()
self.thread = None
def _stream(self):
chunk = 512 # 320 frames at 16000 Hz is 20 ms; Silero-VAD requires 512-frame chunks (32 ms)
# Docs say this only raises an error if neither `input` nor `output` is True
stream = self.audio.open(
format=pyaudio.paFloat32,
channels=1,
rate=16000,
input=True,
input_device_index=self.microphone["index"],
frames_per_buffer=chunk,
)
try:
while not state.exit_event.is_set():
# Don't send audio if Pepper is speaking
if state.is_speaking:
if stream.is_active(): stream.stop_stream()
continue
if stream.is_stopped(): stream.start_stream()
data = stream.read(chunk)
self.socket.send(data)
except IOError as e:
logger.error("Stopped listening: failed to get audio from microphone.", exc_info=e)
finally:
stream.stop_stream()
stream.close()
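
The PUB socket publishes raw float32 audio (native byte order), one 512-sample chunk per message at 16 kHz. A minimal consumer-side sketch, assuming `create_socket` binds the PUB socket on the given port (as the removed standalone script did) and that numpy is available on the consumer:

# Consumer-side sketch, not part of this change. Assumes the PUB socket is
# bound on port 5558 and reachable at localhost; adjust the address for the
# robot's actual host. numpy is only used here to decode the float32 bytes.
import numpy as np
import zmq

context = zmq.Context()
subscriber = context.socket(zmq.SUB)
subscriber.connect("tcp://localhost:5558")
subscriber.setsockopt_string(zmq.SUBSCRIBE, u"")    # Receive every audio message

while True:
    raw = subscriber.recv()                         # One chunk: 512 float32 samples (paFloat32, mono, 16 kHz)
    samples = np.frombuffer(raw, dtype=np.float32)  # Shape (512,), nominally in [-1.0, 1.0]
    # ...feed `samples` into VAD / speech recognition here...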

View File

@@ -1,4 +1,7 @@
import logging
from robot_interface.endpoints.audio_sender import AudioSender
logging.basicConfig(level=logging.DEBUG)
import zmq
@@ -25,8 +28,11 @@ def main_loop(context):
video_sender = VideoSender(context)
state.sockets.append(video_sender)
audio_sender = AudioSender(context)
state.sockets.append(audio_sender)
video_sender.start_video_rcv()
audio_sender.start()
# Sockets that can run on the main thread. These sockets' endpoints should not block for long (say 50 ms at most).
receivers = [main_receiver, actuation_receiver]
@@ -37,8 +43,19 @@ def main_loop(context):
logging.debug("Starting main loop.")
import schedule
test_speaking_message = {"data": "Hi, my name is Pepper, and this is quite a long message."}
def test_speak():
logging.debug("Testing speech.")
actuation_receiver._handle_speech(test_speaking_message)
schedule.every(10).seconds.do(test_speak)
while True:
if state.exit_event.is_set(): break
schedule.run_pending()
socks = dict(poller.poll(100))
for receiver in receivers:
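
The hunk ends before the shutdown path, which is not shown here. Per `wait_until_done`'s docstring, teardown has to set the exit event before joining the audio thread, roughly as in this sketch:

# Shutdown sketch (the actual teardown is not part of the shown hunk).
state.exit_event.set()          # Ask AudioSender._stream to leave its loop first
audio_sender.wait_until_done()  # Then join the thread; joining before setting the event would block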

View File

@@ -18,6 +18,7 @@ class State(object):
self.exit_event = None
self.sockets = [] # type: List[SocketBase]
self.qi_session = None # type: None | ssl.SSLSession
self.is_speaking = False # type: bool
def initialize(self):
if self.is_initialized:

View File

@@ -0,0 +1,69 @@
from __future__ import unicode_literals # So that `print` can print Unicode characters in names
import logging
logger = logging.getLogger(__name__)
def get_microphones(audio):
"""
Get audio devices which have input channels.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: An iterator of PyAudio device-info dicts describing the available microphones.
:rtype: Iterator[dict]
"""
for i in range(audio.get_device_count()):
device = audio.get_device_info_by_index(i)
if device["maxInputChannels"] > 0:
yield device
def choose_mic_interactive(audio):
"""
Choose a microphone to use, interactively in the CLI.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone.
:rtype: dict | None
"""
microphones = list(get_microphones(audio))
if len(microphones) == 0: return None
print("Found {} microphones:".format(len(microphones)))
for i, mic in enumerate(microphones):
print("- {}: {}".format(i, mic["name"]))
chosen_microphone = None
while chosen_microphone is None:
chosen = raw_input("Which device would you like to use?\n> ")
try:
chosen = int(chosen)
if chosen < 0 or chosen >= len(microphones): raise ValueError()
chosen_microphone = microphones[chosen]
except ValueError:
print("Please enter a number between 0 and {}".format(len(microphones)-1))
logger.info("Chose microphone \"{}\"".format(chosen_microphone["name"]))
return chosen_microphone
def choose_mic_default(audio):
"""
Get the system's default microphone to use.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone.
:rtype: dict | None
"""
try:
return audio.get_default_input_device_info()
except IOError:
return None
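
A short usage sketch of the two helpers. The fall-back-to-interactive flow is illustrative only; AudioSender itself only calls `choose_mic_default`:

# Usage sketch, not part of this change. Falls back to the interactive chooser
# when no default input device is configured.
import pyaudio
from robot_interface.utils.microphone import choose_mic_default, choose_mic_interactive

audio = pyaudio.PyAudio()
try:
    microphone = choose_mic_default(audio) or choose_mic_interactive(audio)
    if microphone is None:
        print("No microphone available.")
    else:
        print("Using microphone \"{}\" (index {}).".format(microphone["name"], microphone["index"]))
finally:
    audio.terminate()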

View File

@@ -0,0 +1,95 @@
import random
import sys
from StringIO import StringIO
from robot_interface.utils.microphone import choose_mic_default, choose_mic_interactive, get_microphones
class MicrophoneUtils(object):
"""Shared tests for any PyAudio-like implementation, e.g. mock and real."""
def test_choose_mic_default(self, pyaudio_instance):
"""
The result must contain at least "index", as this is used to identify the microphone.
The "name" is used for logging, so it should also exist.
It must have one or more input channels.
Lastly, it must support a sample rate of at least 16000 Hz.
"""
result = choose_mic_default(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert "name" in result
assert isinstance(result["name"], (str, unicode))
assert "maxInputChannels" in result
assert isinstance(result["maxInputChannels"], (int, long))
assert result["maxInputChannels"] > 0
assert "defaultSampleRate" in result
assert isinstance(result["defaultSampleRate"], float)
assert result["defaultSampleRate"] >= 16000
def test_choose_mic_interactive_input_not_int(self, pyaudio_instance, mocker):
"""
First mock an input that's not an integer, then a valid integer. There should be no errors.
"""
mock_input = mocker.patch("__builtin__.raw_input", side_effect=["not an integer", "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == 0
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_negative_index(self, pyaudio_instance, mocker):
"""
Make sure that the interactive method does not allow negative integers as input.
"""
mock_input = mocker.patch("__builtin__.raw_input", side_effect=["-1", "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == 0
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_index_too_high(self, pyaudio_instance, mocker):
"""
Make sure that the interactive method does not allow indices higher than the highest mic index.
"""
real_count = len(list(get_microphones(pyaudio_instance)))
mock_input = mocker.patch("__builtin__.raw_input", side_effect=[str(real_count), "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_random_index(self, pyaudio_instance, mocker):
"""
Get a random index from the list of available mics, make sure it's correct.
"""
microphones = list(get_microphones(pyaudio_instance))
random_index = random.randrange(len(microphones))
mocker.patch("__builtin__.raw_input", side_effect=[str(random_index)])
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == microphones[random_index]["index"]

View File

View File

@@ -0,0 +1,20 @@
import pyaudio
import pytest
from common.microphone_utils import MicrophoneUtils
@pytest.fixture
def pyaudio_instance():
audio = pyaudio.PyAudio()
try:
audio.get_default_input_device_info()
return audio
except IOError:
pytest.skip("No microphone available to test with.")
class TestAudioIntegration(MicrophoneUtils):
"""Run shared audio behavior tests with the mock implementation."""
pass

View File

@@ -0,0 +1,104 @@
# coding=utf-8
import os
import time
import mock
import pytest
import zmq
from robot_interface.endpoints.audio_sender import AudioSender
@pytest.fixture
def zmq_context():
context = zmq.Context()
yield context
def test_no_microphone(zmq_context, mocker):
mock_info_logger = mocker.patch("robot_interface.endpoints.audio_sender.logger.info")
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic_default")
mock_choose_mic.return_value = None
sender = AudioSender(zmq_context)
assert sender.microphone is None
sender.start()
assert sender.thread is None
mock_info_logger.assert_called()
sender.wait_until_done() # Should return early because we didn't start a thread
def test_unicode_mic_name(zmq_context, mocker):
mocker.patch("robot_interface.endpoints.audio_sender.threading")
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic_default")
mock_choose_mic.return_value = {"name": u"• Some Unicode name"}
sender = AudioSender(zmq_context)
assert sender.microphone is not None
# `.start()` logs the name of the microphone. It should not give an error if it contains Unicode
# symbols.
sender.start()
assert sender.thread is not None
sender.wait_until_done() # Should return instantly because we didn't start a real thread
def _fake_read(num_frames):
return os.urandom(num_frames * 4)
def test_sending_audio(mocker):
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic_default")
mock_choose_mic.return_value = {"name": u"Some mic", "index": 0L}
mock_state = mocker.patch("robot_interface.endpoints.audio_sender.state")
mock_state.exit_event.is_set.side_effect = [False, True]
mock_state.is_speaking = False  # A Mock attribute is truthy, which would make the sender skip reading and never send
mock_zmq_context = mock.Mock()
send_socket = mock.Mock()
# A healthy stream returns raw audio bytes from `read`; `_fake_read` fakes one chunk.
stream = mock.Mock()
stream.read = _fake_read
sender = AudioSender(mock_zmq_context)
sender.socket.send = send_socket
sender.audio.open = mock.Mock()
sender.audio.open.return_value = stream
sender.start()
sender.wait_until_done()
send_socket.assert_called()
def _fake_read_error(num_frames):
raise IOError()
def test_break_microphone(mocker):
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic_default")
mock_choose_mic.return_value = {"name": u"Some mic", "index": 0L}
mock_state = mocker.patch("robot_interface.endpoints.audio_sender.state")
mock_state.exit_event.is_set.side_effect = [False, True]
mock_state.is_speaking = False  # A Mock attribute is truthy, which would make the sender skip reading entirely
mock_zmq_context = mock.Mock()
send_socket = mock.Mock()
# If there's something wrong with the microphone, it will raise an IOError when `read`ing.
stream = mock.Mock()
stream.read = _fake_read_error
sender = AudioSender(mock_zmq_context)
sender.socket.send = send_socket
sender.audio.open = mock.Mock()
sender.audio.open.return_value = stream
sender.start()
sender.wait_until_done()
send_socket.assert_not_called()

View File

@@ -0,0 +1,85 @@
# coding=utf-8
import mock
import pytest
from common.microphone_utils import MicrophoneUtils
from robot_interface.utils.microphone import choose_mic_default, choose_mic_interactive
class MockPyAudio(object):
def __init__(self):
# You can predefine fake device info here
self.devices = [
{
"index": 0,
"name": u"Someones Microphone", # Using a Unicode character
"maxInputChannels": 2,
"maxOutputChannels": 0,
"defaultSampleRate": 44100.0,
"defaultLowInputLatency": 0.01,
"defaultLowOutputLatency": 0.01,
"defaultHighInputLatency": 0.1,
"defaultHighOutputLatency": 0.1,
"hostApi": 0,
},
{
"index": 1,
"name": u"Mock Speaker 1",
"maxInputChannels": 0,
"maxOutputChannels": 2,
"defaultSampleRate": 48000.0,
"defaultLowInputLatency": 0.01,
"defaultLowOutputLatency": 0.01,
"defaultHighInputLatency": 0.1,
"defaultHighOutputLatency": 0.1,
"hostApi": 0,
},
]
def get_device_count(self):
"""Return the number of available mock devices."""
return len(self.devices)
def get_device_info_by_index(self, index):
"""Return information for a given mock device index."""
if 0 <= index < len(self.devices):
return self.devices[index]
else:
raise IOError("Invalid device index: {}".format(index))
def get_default_input_device_info(self):
"""Return info for a default mock input device."""
for device in self.devices:
if device.get("maxInputChannels", 0) > 0:
return device
raise IOError("No default input device found")
@pytest.fixture
def pyaudio_instance():
return MockPyAudio()
def _raise_io_error():
raise IOError()
class TestAudioUnit(MicrophoneUtils):
"""Run shared audio behavior tests with the mock implementation."""
def test_choose_mic_default_no_mic(self):
mock_pyaudio = mock.Mock()
mock_pyaudio.get_device_count = mock.Mock(return_value=0L)
mock_pyaudio.get_default_input_device_info = _raise_io_error
result = choose_mic_default(mock_pyaudio)
assert result is None
def test_choose_mic_interactive_no_mic(self):
mock_pyaudio = mock.Mock()
mock_pyaudio.get_device_count = mock.Mock(return_value=0L)
mock_pyaudio.get_default_input_device_info = _raise_io_error
result = choose_mic_interactive(mock_pyaudio)
assert result is None