# pepperplus-cb/test/unit/agents/perception/transcription_agent/test_speech_recognizer.py

import numpy as np
import pytest

from control_backend.agents.perception.transcription_agent.speech_recognizer import (
    OpenAIWhisperSpeechRecognizer,
    SpeechRecognizer,
)


@pytest.fixture(autouse=True)
def patch_sr_settings(monkeypatch):
    """Pin the speech-recognizer settings to fixed values for every test in this module.

    The overrides are written onto the ``settings`` object reachable through the
    ``speech_recognizer`` module itself, so the code under test sees them.
    ``monkeypatch`` undoes each override when the test finishes.
    """
    # Imported here so we patch the settings object the recognizer module holds
    from control_backend.agents.perception.transcription_agent import speech_recognizer as sr

    # (target object, attribute name, value) for the numbers that the original
    # author lists as inputs of _estimate_max_tokens()
    overrides = (
        (sr.settings.vad_settings, "sample_rate_hz", 16_000),
        (sr.settings.behaviour_settings, "transcription_words_per_minute", 450),
        (sr.settings.behaviour_settings, "transcription_words_per_token", 0.75),
        (sr.settings.behaviour_settings, "transcription_token_buffer", 10),
    )

    for target, attribute, value in overrides:
        # raising=False: set the attribute even when the target does not define it yet
        monkeypatch.setattr(target, attribute, value, raising=False)


def test_estimate_max_tokens():
    """One minute of 16 kHz audio must give an integer estimate of exactly 610 tokens.

    The expected value is consistent with the patched settings:
    450 words per minute / 0.75 words per token + a 10 token buffer = 610.
    """
    seconds, sample_rate_hz = 60, 16_000
    # Only the buffer's length is chosen here; np.empty leaves the contents uninitialised
    one_minute = np.empty(seconds * sample_rate_hz, dtype=np.float32)

    estimate = SpeechRecognizer._estimate_max_tokens(one_minute)

    assert estimate == 610
    assert isinstance(estimate, int)


def test_get_decode_options():
    """Verify when ``_get_decode_options`` includes an integer ``sample_len`` entry.

    Scenarios covered, in order: default construction, ``limit_output_length=True``
    and ``limit_output_length=False``.
    """
    samples = np.empty(60 * 16_000, dtype=np.float32)

    # Both the defaults and an explicit opt-in should limit output length
    # based on input size
    for ctor_kwargs in ({}, {"limit_output_length": True}):
        limited = OpenAIWhisperSpeechRecognizer(**ctor_kwargs)
        decode_options = limited._get_decode_options(samples)

        assert "sample_len" in decode_options
        assert isinstance(decode_options["sample_len"], int)

    # When disabled, no output-length limit should appear in the options
    unlimited = OpenAIWhisperSpeechRecognizer(limit_output_length=False)
    decode_options = unlimited._get_decode_options(samples)
    assert "sample_len" not in decode_options