# pepperplus-cb/test/unit/agents/transcription/test_speech_recognizer.py

import numpy as np

from control_backend.agents.transcription import SpeechRecognizer
from control_backend.agents.transcription.speech_recognizer import OpenAIWhisperSpeechRecognizer


def test_estimate_max_tokens():
    """Check the token estimate for one minute of audio.

    The buffer holds 60 * 16,000 float32 samples (one minute of audio).
    Assuming 300 words per minute, the estimate is expected to be exactly
    400 tokens, returned as a plain ``int``.
    """
    # Only the length of the buffer matters here, so its contents are left
    # uninitialised.
    one_minute_of_samples = 60 * 16_000
    audio = np.empty(one_minute_of_samples, dtype=np.float32)

    estimate = SpeechRecognizer._estimate_max_tokens(audio)

    assert estimate == 400
    assert isinstance(estimate, int)


def test_get_decode_options():
    """Check whether the right decode options are given under different scenarios.

    Three configurations of ``OpenAIWhisperSpeechRecognizer`` are exercised
    against the same one-minute audio buffer:

    * default construction: ``sample_len`` must be present and an ``int``;
    * ``limit_output_length=True``: same expectation as the default;
    * ``limit_output_length=False``: ``sample_len`` must be absent.
    """
    audio = np.empty(shape=(60 * 16_000), dtype=np.float32)

    # With the defaults, it should limit output length based on input size
    recognizer = OpenAIWhisperSpeechRecognizer()
    options = recognizer._get_decode_options(audio)

    assert "sample_len" in options
    assert isinstance(options["sample_len"], int)

    # When explicitly enabled, it should limit output length based on input size
    recognizer = OpenAIWhisperSpeechRecognizer(limit_output_length=True)
    options = recognizer._get_decode_options(audio)

    assert "sample_len" in options
    assert isinstance(options["sample_len"], int)

    # When disabled, it should not limit output length based on input size.
    # A fresh recognizer and a fresh options dict are required here; reusing
    # the dict from the enabled case would make the assertion meaningless.
    recognizer = OpenAIWhisperSpeechRecognizer(limit_output_length=False)
    options = recognizer._get_decode_options(audio)

    assert "sample_len" not in options