Merge branch 'feat/stream-audio' into 'dev'

Implement audio streaming

See merge request ics/sp/2025/n25b/pepperplus-ri!8
This commit was merged in pull request #8.
This commit is contained in:
Pim Hutting
2025-11-05 12:08:28 +00:00
12 changed files with 574 additions and 1 deletions

View File

@@ -34,6 +34,18 @@ python -m virtualenv .venv
source .venv/bin/activate source .venv/bin/activate
``` ```
We depend on PortAudio for the `pyaudio` package, so install it with:
```bash
sudo apt install -y portaudio19-dev
```
On WSL, also install:
```bash
sudo apt install -y libasound2-plugins
```
Install the required packages with Install the required packages with
```bash ```bash
@@ -98,6 +110,8 @@ $env:PYTHONPATH="src"; python -m robot_interface.main
With both, if you want to connect to the actual robot (or simulator), pass the `--qi-url` argument. With both, if you want to connect to the actual robot (or simulator), pass the `--qi-url` argument.
There's also a `--microphone` argument that can be used to choose a microphone to use. If not given, the program will try the default microphone. If you don't know the name of the microphone, pass the argument with any value, and it will list the names of available microphones.
## Testing ## Testing

View File

@@ -1,3 +1,4 @@
from __future__ import unicode_literals # So that we can log texts with Unicode characters
import logging import logging
import zmq import zmq

View File

@@ -0,0 +1,72 @@
from __future__ import unicode_literals # So that `logging` can use Unicode characters in names
import threading
import logging
import pyaudio
import zmq
from robot_interface.endpoints.socket_base import SocketBase
from robot_interface.state import state
from robot_interface.utils.microphone import choose_mic
logger = logging.getLogger(__name__)
class AudioSender(SocketBase):
def __init__(self, zmq_context, port=5558):
super(AudioSender, self).__init__(str("audio")) # Convert future's unicode_literal to str
self.create_socket(zmq_context, zmq.PUB, port)
self.thread = None
try:
self.audio = pyaudio.PyAudio()
self.microphone = choose_mic(self.audio)
except IOError as e:
logger.warning("PyAudio is not available.", exc_info=e)
self.audio = None
self.microphone = None
def start(self):
"""
Start sending audio in a different thread.
"""
if not self.microphone:
logger.info("Not listening: no microphone available.")
return
logger.info("Listening with microphone \"{}\".".format(self.microphone["name"]))
self.thread = threading.Thread(target=self._stream)
self.thread.start()
def wait_until_done(self):
"""
Wait until the audio thread is done. Will only be done if `state.exit_event` is set, so
make sure to set that before calling this method or it will block.
"""
if not self.thread: return
self.thread.join()
self.thread = None
def _stream(self):
chunk = 512 # 320 at 16000 Hz is 20ms, 512 is required for Silero-VAD
# Docs say this only raises an error if neither `input` nor `output` is True
stream = self.audio.open(
format=pyaudio.paFloat32,
channels=1,
rate=16000,
input=True,
input_device_index=self.microphone["index"],
frames_per_buffer=chunk,
)
try:
while not state.exit_event.is_set():
data = stream.read(chunk)
self.socket.send(data)
except IOError as e:
logger.error("Stopped listening: failed to get audio from microphone.", exc_info=e)
finally:
stream.stop_stream()
stream.close()

View File

@@ -1,6 +1,5 @@
import zmq import zmq
import threading import threading
import qi
import logging import logging
from robot_interface.endpoints.socket_base import SocketBase from robot_interface.endpoints.socket_base import SocketBase

View File

@@ -1,4 +1,7 @@
import logging import logging
from robot_interface.endpoints.audio_sender import AudioSender
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
import zmq import zmq
@@ -25,8 +28,11 @@ def main_loop(context):
video_sender = VideoSender(context) video_sender = VideoSender(context)
state.sockets.append(video_sender) state.sockets.append(video_sender)
audio_sender = AudioSender(context)
state.sockets.append(audio_sender)
video_sender.start_video_rcv() video_sender.start_video_rcv()
audio_sender.start()
# Sockets that can run on the main thread. These sockets' endpoints should not block for long (say 50 ms at most). # Sockets that can run on the main thread. These sockets' endpoints should not block for long (say 50 ms at most).
receivers = [main_receiver, actuation_receiver] receivers = [main_receiver, actuation_receiver]

View File

@@ -0,0 +1,120 @@
from __future__ import unicode_literals # So that `print` can print Unicode characters in names
import logging
import sys
logger = logging.getLogger(__name__)
def get_microphones(audio):
"""
Get audio devices which have input channels.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: An interator of PaAudio dicts containing information about the microphone devices.
:rtype: Iterator[dict]
"""
for i in range(audio.get_device_count()):
device = audio.get_device_info_by_index(i)
if device["maxInputChannels"] > 0:
yield device
def choose_mic_interactive(audio):
"""
Choose a microphone to use, interactively in the CLI.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone.
:rtype: dict | None
"""
microphones = list(get_microphones(audio))
if len(microphones) == 0: return None
print("Found {} microphones:".format(len(microphones)))
for i, mic in enumerate(microphones):
print("- {}: {}".format(i, mic["name"]))
chosen_microphone = None
while chosen_microphone is None:
chosen = raw_input("Which device would you like to use?\n> ")
try:
chosen = int(chosen)
if chosen < 0 or chosen >= len(microphones): raise ValueError()
chosen_microphone = microphones[chosen]
except ValueError:
print("Please enter a number between 0 and {}".format(len(microphones)-1))
logger.info("Chose microphone \"{}\"".format(chosen_microphone["name"]))
return chosen_microphone
def choose_mic_default(audio):
"""
Get the system's default microphone to use.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone.
:rtype: dict | None
"""
try:
return audio.get_default_input_device_info()
except IOError:
return None
def choose_mic_arguments(audio):
"""
Get a microphone to use from command line arguments.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone satisfied by the arguments.
:rtype: dict | None
"""
microphone_name = None
for i, arg in enumerate(sys.argv):
if arg == "--microphone" and len(sys.argv) > i+1:
microphone_name = sys.argv[i+1].strip()
if arg.startswith("--microphone="):
microphone_name = arg[13:].strip()
if not microphone_name: return None
available_mics = list(get_microphones(audio))
for mic in available_mics:
if mic["name"] == microphone_name:
return mic
available_mic_names = [mic["name"] for mic in available_mics]
logger.warning("Microphone \"{}\" not found. Choose one of {}"
.format(microphone_name, available_mic_names))
return None
def choose_mic(audio):
"""
Get a microphone to use. Firstly, tries to see if there's an application argument specifying the
microphone to use. If not, get the default microphone.
:param audio: An instance of PyAudio to use.
:type audio: pyaudio.PyAudio
:return: A dictionary from PyAudio containing information about the microphone to use, or None
if there is no microphone.
:rtype: dict | None
"""
chosen_mic = choose_mic_arguments(audio)
if chosen_mic: return chosen_mic
return choose_mic_default(audio)

0
test/common/__init__.py Normal file
View File

View File

@@ -0,0 +1,152 @@
from __future__ import unicode_literals # So that we can format strings with Unicode characters
import random
import sys
from StringIO import StringIO
from robot_interface.utils.microphone import (
choose_mic_default,
choose_mic_interactive,
choose_mic_arguments,
choose_mic,
get_microphones,
)
class MicrophoneUtils(object):
"""Shared tests for any PyAudio-like implementation, e.g. mock and real."""
def test_choose_mic_default(self, pyaudio_instance):
"""
The result must contain at least "index", as this is used to identify the microphone.
The "name" is used for logging, so it should also exist.
It must have one or more channels.
Lastly it must be capable of sending at least 16000 samples per second.
"""
result = choose_mic_default(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert "name" in result
assert isinstance(result["name"], (str, unicode))
assert "maxInputChannels" in result
assert isinstance(result["maxInputChannels"], (int, long))
assert result["maxInputChannels"] > 0
assert "defaultSampleRate" in result
assert isinstance(result["defaultSampleRate"], float)
assert result["defaultSampleRate"] >= 16000
def test_choose_mic_interactive_input_not_int(self, pyaudio_instance, mocker):
"""
First mock an input that's not an integer, then a valid integer. There should be no errors.
"""
mock_input = mocker.patch("__builtin__.raw_input", side_effect=["not an integer", "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == 0
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_negative_index(self, pyaudio_instance, mocker):
"""
Make sure that the interactive method does not allow negative integers as input.
"""
mock_input = mocker.patch("__builtin__.raw_input", side_effect=["-1", "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == 0
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_index_too_high(self, pyaudio_instance, mocker):
"""
Make sure that the interactive method does not allow indices higher than the highest mic index.
"""
real_count = len(list(get_microphones(pyaudio_instance)))
mock_input = mocker.patch("__builtin__.raw_input", side_effect=[str(real_count), "0"])
fake_out = StringIO()
mocker.patch.object(sys, "stdout", fake_out)
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert mock_input.called
assert any(p.startswith("Please enter a number") for p in fake_out.getvalue().splitlines())
def test_choose_mic_interactive_random_index(self, pyaudio_instance, mocker):
"""
Get a random index from the list of available mics, make sure it's correct.
"""
microphones = list(get_microphones(pyaudio_instance))
random_index = random.randrange(len(microphones))
mocker.patch("__builtin__.raw_input", side_effect=[str(random_index)])
result = choose_mic_interactive(pyaudio_instance)
assert "index" in result
assert isinstance(result["index"], (int, long))
assert result["index"] == microphones[random_index]["index"]
def test_choose_mic_no_arguments(self, pyaudio_instance, mocker):
mocker.patch.object(sys, "argv", [])
result = choose_mic_arguments(pyaudio_instance)
assert result is None
def test_choose_mic_arguments(self, pyaudio_instance, mocker):
for mic in get_microphones(pyaudio_instance):
mocker.patch.object(sys, "argv", ["--microphone", mic["name"]])
result = choose_mic_arguments(pyaudio_instance)
assert result is not None
assert result == mic
def test_choose_mic_arguments_eq(self, pyaudio_instance, mocker):
for mic in get_microphones(pyaudio_instance):
mocker.patch.object(sys, "argv", ["--microphone={}".format(mic["name"])])
result = choose_mic_arguments(pyaudio_instance)
assert result is not None
assert result == mic
def test_choose_mic_arguments_not_exits(self, pyaudio_instance, mocker):
mocker.patch.object(sys, "argv", ["--microphone", "Surely this microphone doesn't exist"])
result = choose_mic_arguments(pyaudio_instance)
assert result is None
def test_choose_mic_with_argument(self, pyaudio_instance, mocker):
mic = next(get_microphones(pyaudio_instance))
mocker.patch.object(sys, "argv", ["--microphone", mic["name"]])
result = choose_mic(pyaudio_instance)
assert result is not None
assert result == mic
def test_choose_mic_no_argument(self, pyaudio_instance, mocker):
default_mic = choose_mic_default(pyaudio_instance)
mocker.patch.object(sys, "argv", [])
result = choose_mic(pyaudio_instance)
assert result is not None
assert result == default_mic

View File

View File

@@ -0,0 +1,20 @@
import pyaudio
import pytest
from common.microphone_utils import MicrophoneUtils
@pytest.fixture
def pyaudio_instance():
audio = pyaudio.PyAudio()
try:
audio.get_default_input_device_info()
return audio
except IOError:
pytest.skip("No microphone available to test with.")
class TestAudioIntegration(MicrophoneUtils):
"""Run shared audio behavior tests with the mock implementation."""
pass

View File

@@ -0,0 +1,104 @@
# coding=utf-8
import os
import time
import mock
import pytest
import zmq
from robot_interface.endpoints.audio_sender import AudioSender
@pytest.fixture
def zmq_context():
context = zmq.Context()
yield context
def test_no_microphone(zmq_context, mocker):
mock_info_logger = mocker.patch("robot_interface.endpoints.audio_sender.logger.info")
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic")
mock_choose_mic.return_value = None
sender = AudioSender(zmq_context)
assert sender.microphone is None
sender.start()
assert sender.thread is None
mock_info_logger.assert_called()
sender.wait_until_done() # Should return early because we didn't start a thread
def test_unicode_mic_name(zmq_context, mocker):
mocker.patch("robot_interface.endpoints.audio_sender.threading")
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic")
mock_choose_mic.return_value = {"name": u"• Some Unicode name"}
sender = AudioSender(zmq_context)
assert sender.microphone is not None
# `.start()` logs the name of the microphone. It should not give an error if it contains Unicode
# symbols.
sender.start()
assert sender.thread is not None
sender.wait_until_done() # Should return instantly because we didn't start a real thread
def _fake_read(num_frames):
return os.urandom(num_frames * 4)
def test_sending_audio(mocker):
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic")
mock_choose_mic.return_value = {"name": u"Some mic", "index": 0L}
mock_state = mocker.patch("robot_interface.endpoints.audio_sender.state")
mock_state.exit_event.is_set.side_effect = [False, True]
mock_zmq_context = mock.Mock()
send_socket = mock.Mock()
# If there's something wrong with the microphone, it will raise an IOError when `read`ing.
stream = mock.Mock()
stream.read = _fake_read
sender = AudioSender(mock_zmq_context)
sender.socket.send = send_socket
sender.audio.open = mock.Mock()
sender.audio.open.return_value = stream
sender.start()
sender.wait_until_done()
send_socket.assert_called()
def _fake_read_error(num_frames):
raise IOError()
def test_break_microphone(mocker):
mock_choose_mic = mocker.patch("robot_interface.endpoints.audio_sender.choose_mic")
mock_choose_mic.return_value = {"name": u"Some mic", "index": 0L}
mock_state = mocker.patch("robot_interface.endpoints.audio_sender.state")
mock_state.exit_event.is_set.side_effect = [False, True]
mock_zmq_context = mock.Mock()
send_socket = mock.Mock()
# If there's something wrong with the microphone, it will raise an IOError when `read`ing.
stream = mock.Mock()
stream.read = _fake_read_error
sender = AudioSender(mock_zmq_context)
sender.socket.send = send_socket
sender.audio.open = mock.Mock()
sender.audio.open.return_value = stream
sender.start()
sender.wait_until_done()
send_socket.assert_not_called()

View File

@@ -0,0 +1,85 @@
# coding=utf-8
import mock
import pytest
from common.microphone_utils import MicrophoneUtils
from robot_interface.utils.microphone import choose_mic_default, choose_mic_interactive
class MockPyAudio:
def __init__(self):
# You can predefine fake device info here
self.devices = [
{
"index": 0,
"name": u"Someones Microphone", # Using a Unicode character
"maxInputChannels": 2,
"maxOutputChannels": 0,
"defaultSampleRate": 44100.0,
"defaultLowInputLatency": 0.01,
"defaultLowOutputLatency": 0.01,
"defaultHighInputLatency": 0.1,
"defaultHighOutputLatency": 0.1,
"hostApi": 0,
},
{
"index": 1,
"name": u"Mock Speaker 1",
"maxInputChannels": 0,
"maxOutputChannels": 2,
"defaultSampleRate": 48000.0,
"defaultLowInputLatency": 0.01,
"defaultLowOutputLatency": 0.01,
"defaultHighInputLatency": 0.1,
"defaultHighOutputLatency": 0.1,
"hostApi": 0,
},
]
def get_device_count(self):
"""Return the number of available mock devices."""
return len(self.devices)
def get_device_info_by_index(self, index):
"""Return information for a given mock device index."""
if 0 <= index < len(self.devices):
return self.devices[index]
else:
raise IOError("Invalid device index: {}".format(index))
def get_default_input_device_info(self):
"""Return info for a default mock input device."""
for device in self.devices:
if device.get("maxInputChannels", 0) > 0:
return device
raise IOError("No default input device found")
@pytest.fixture
def pyaudio_instance():
return MockPyAudio()
def _raise_io_error():
raise IOError()
class TestAudioUnit(MicrophoneUtils):
"""Run shared audio behavior tests with the mock implementation."""
def test_choose_mic_default_no_mic(self):
mock_pyaudio = mock.Mock()
mock_pyaudio.get_device_count = mock.Mock(return_value=0L)
mock_pyaudio.get_default_input_device_info = _raise_io_error
result = choose_mic_default(mock_pyaudio)
assert result is None
def test_choose_mic_interactive_no_mic(self):
mock_pyaudio = mock.Mock()
mock_pyaudio.get_device_count = mock.Mock(return_value=0L)
mock_pyaudio.get_default_input_device_info = _raise_io_error
result = choose_mic_interactive(mock_pyaudio)
assert result is None