Merge branch 'dev' into refactor/logging

2025-11-05 15:09:14 +01:00
parent 220c5c7739 4d38850a1d
commit d9fef22090
11 changed files with 153 additions and 51 deletions
--- a/src/control_backend/agents/transcription/speech_recognizer.py
+++ b/src/control_backend/agents/transcription/speech_recognizer.py
@@ -36,16 +36,16 @@ class SpeechRecognizer(abc.ABC):
    def _estimate_max_tokens(audio: np.ndarray) -> int:
        """
        Estimate the maximum length of a given audio sample in tokens. Assumes a maximum speaking
-        rate of 300 words per minute (2x average), and assumes that 3 words is 4 tokens.
+        rate of 450 words per minute (3x average), and assumes that 3 words is 4 tokens.

        :param audio: The audio sample (16 kHz) to use for length estimation.
        :return: The estimated length of the transcribed audio in tokens.
        """
        length_seconds = len(audio) / 16_000
        length_minutes = length_seconds / 60
-        word_count = length_minutes * 300
+        word_count = length_minutes * 450
        token_count = word_count / 3 * 4
-        return int(token_count)
+        return int(token_count) + 10

    def _get_decode_options(self, audio: np.ndarray) -> dict:
        """
@@ -85,9 +85,10 @@ class MLXWhisperSpeechRecognizer(SpeechRecognizer):
    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
        return mlx_whisper.transcribe(
-            audio, path_or_hf_repo=self.model_name, decode_options=self._get_decode_options(audio)
-        )["text"]
-        return mlx_whisper.transcribe(audio, path_or_hf_repo=self.model_name)["text"].strip()
+            audio,
+            path_or_hf_repo=self.model_name,
+            **self._get_decode_options(audio),
+        )["text"].strip()


 class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):
@@ -103,6 +104,4 @@ class OpenAIWhisperSpeechRecognizer(SpeechRecognizer):

    def recognize_speech(self, audio: np.ndarray) -> str:
        self.load_model()
-        return whisper.transcribe(
-            self.model, audio, decode_options=self._get_decode_options(audio)
-        )["text"]
+        return whisper.transcribe(self.model, audio, **self._get_decode_options(audio))["text"]
--- a/src/control_backend/agents/transcription/transcription_agent.py
+++ b/src/control_backend/agents/transcription/transcription_agent.py
@@ -56,6 +56,10 @@ class TranscriptionAgent(BaseAgent):
            audio = await self.audio_in_socket.recv()
            audio = np.frombuffer(audio, dtype=np.float32)
            speech = await self._transcribe(audio)
+            if not speech:
+                self.agent.logger.info("Nothing transcribed.")
+                return
+
            self.agent.logger.info("Transcribed speech: %s", speech)

            await self._share_transcription(speech)