feat: prepend more silence before speech audio for better transcription beginnings
ref: N25B-429
This commit is contained in:
@@ -10,7 +10,7 @@ LLM_SETTINGS__LOCAL_LLM_URL="http://localhost:1234/v1/chat/completions"
|
||||
LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss"
|
||||
|
||||
# Number of non-speech chunks to wait before speech ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
|
||||
BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=3
|
||||
BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15
|
||||
|
||||
# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off.
|
||||
BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100
|
||||
|
||||
@@ -229,10 +229,11 @@ class VADAgent(BaseAgent):
|
||||
assert self.model is not None
|
||||
prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
|
||||
non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
|
||||
begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
|
||||
prob_threshold = settings.behaviour_settings.vad_prob_threshold
|
||||
|
||||
if prob > prob_threshold:
|
||||
if self.i_since_speech > non_speech_patience:
|
||||
if self.i_since_speech > non_speech_patience + begin_silence_length:
|
||||
self.logger.debug("Speech started.")
|
||||
self.audio_buffer = np.append(self.audio_buffer, chunk)
|
||||
self.i_since_speech = 0
|
||||
@@ -246,11 +247,12 @@ class VADAgent(BaseAgent):
|
||||
continue
|
||||
|
||||
# Speech probably ended. Make sure we have a usable amount of data.
|
||||
if len(self.audio_buffer) >= 3 * len(chunk):
|
||||
if len(self.audio_buffer) > begin_silence_length * len(chunk):
|
||||
self.logger.debug("Speech ended.")
|
||||
assert self.audio_out_socket is not None
|
||||
await self.audio_out_socket.send(self.audio_buffer[: -2 * len(chunk)].tobytes())
|
||||
|
||||
# At this point, we know that the speech has ended.
|
||||
# Prepend the last chunk that had no speech, for a more fluent boundary
|
||||
self.audio_buffer = chunk
|
||||
# At this point, we know that there is no speech.
|
||||
# Prepend the last few chunks that had no speech, for a more fluent boundary.
|
||||
self.audio_buffer = np.append(self.audio_buffer, chunk)
|
||||
self.audio_buffer = self.audio_buffer[-begin_silence_length * len(chunk) :]
|
||||
|
||||
@@ -73,6 +73,7 @@ class BehaviourSettings(BaseModel):
|
||||
:ivar vad_prob_threshold: Probability threshold for Voice Activity Detection.
|
||||
:ivar vad_initial_since_speech: Initial value for 'since speech' counter in VAD.
|
||||
:ivar vad_non_speech_patience_chunks: Number of non-speech chunks to wait before speech ended.
|
||||
:ivar vad_begin_silence_chunks: The number of chunks of silence to prepend to speech chunks.
|
||||
:ivar transcription_max_concurrent_tasks: Maximum number of concurrent transcription tasks.
|
||||
:ivar transcription_words_per_minute: Estimated words per minute for transcription timing.
|
||||
:ivar transcription_words_per_token: Estimated words per token for transcription timing.
|
||||
@@ -90,6 +91,7 @@ class BehaviourSettings(BaseModel):
|
||||
vad_prob_threshold: float = 0.5
|
||||
vad_initial_since_speech: int = 100
|
||||
vad_non_speech_patience_chunks: int = 15
|
||||
vad_begin_silence_chunks: int = 3
|
||||
|
||||
# transcription behaviour
|
||||
transcription_max_concurrent_tasks: int = 3
|
||||
|
||||
Reference in New Issue
Block a user