import numpy as np
import pytest

from control_backend.agents.perception.transcription_agent.speech_recognizer import (
    OpenAIWhisperSpeechRecognizer,
    SpeechRecognizer,
)


@pytest.fixture(autouse=True)
def patch_sr_settings(monkeypatch):
    """Patch the module-local settings that SpeechRecognizer imported.

    Provides concrete numeric values for everything _estimate_max_tokens()
    reads, so the tests are independent of real configuration.
    """
    from control_backend.agents.perception.transcription_agent import (
        speech_recognizer as sr,
    )

    monkeypatch.setattr(
        sr.settings.vad_settings, "sample_rate_hz", 16_000, raising=False
    )
    monkeypatch.setattr(
        sr.settings.behaviour_settings,
        "transcription_words_per_minute",
        450,
        raising=False,
    )
    monkeypatch.setattr(
        sr.settings.behaviour_settings,
        "transcription_words_per_token",
        0.75,
        raising=False,
    )
    monkeypatch.setattr(
        sr.settings.behaviour_settings,
        "transcription_token_buffer",
        10,
        raising=False,
    )


def test_estimate_max_tokens():
    """Inputting one minute of audio, assuming 450 words per minute and
    adding a 10 token padding, expecting 610 tokens."""
    # 60 seconds at the patched 16 kHz sample rate.
    audio = np.empty(shape=(60 * 16_000), dtype=np.float32)

    actual = SpeechRecognizer._estimate_max_tokens(audio)

    # 450 wpm / 0.75 words-per-token = 600 tokens, + 10 token buffer.
    assert actual == 610
    assert isinstance(actual, int)


def test_get_decode_options():
    """Check whether the right decode options are given under different scenarios."""
    audio = np.empty(shape=(60 * 16_000), dtype=np.float32)

    # With the defaults, it should limit output length based on input size
    recognizer = OpenAIWhisperSpeechRecognizer()
    options = recognizer._get_decode_options(audio)
    assert "sample_len" in options
    assert isinstance(options["sample_len"], int)

    # When explicitly enabled, it should limit output length based on input size
    recognizer = OpenAIWhisperSpeechRecognizer(limit_output_length=True)
    options = recognizer._get_decode_options(audio)
    assert "sample_len" in options
    assert isinstance(options["sample_len"], int)

    # When disabled, it should not limit output length based on input size.
    # BUG FIX: the original never constructed a disabled recognizer (it
    # re-checked the options from the enabled case) and asserted on the
    # wrong key ("sample_rate" instead of "sample_len"), so the disabled
    # path was never actually tested.
    recognizer = OpenAIWhisperSpeechRecognizer(limit_output_length=False)
    options = recognizer._get_decode_options(audio)
    assert "sample_len" not in options