feat: prepend more silence before speech audio for better transcription beginnings

ref: N25B-429
2026-01-08 10:49:13 +01:00
parent 5a61225c6f
commit 45719c580b
3 changed files with 10 additions and 6 deletions
--- a/src/control_backend/core/config.py
+++ b/src/control_backend/core/config.py
@@ -73,6 +73,7 @@ class BehaviourSettings(BaseModel):
    :ivar vad_prob_threshold: Probability threshold for Voice Activity Detection.
    :ivar vad_initial_since_speech: Initial value for 'since speech' counter in VAD.
    :ivar vad_non_speech_patience_chunks: Number of non-speech chunks to wait before speech ended.
+    :ivar vad_begin_silence_chunks: Number of silence chunks to prepend to detected speech audio.
    :ivar transcription_max_concurrent_tasks: Maximum number of concurrent transcription tasks.
    :ivar transcription_words_per_minute: Estimated words per minute for transcription timing.
    :ivar transcription_words_per_token: Estimated words per token for transcription timing.
@@ -90,6 +91,7 @@ class BehaviourSettings(BaseModel):
    vad_prob_threshold: float = 0.5
    vad_initial_since_speech: int = 100
    vad_non_speech_patience_chunks: int = 15
+    vad_begin_silence_chunks: int = 3

    # transcription behaviour
    transcription_max_concurrent_tasks: int = 3