fix: transcribe remaining buffer when network drops
This commit is contained in:
@@ -12,8 +12,8 @@ LLM_SETTINGS__LOCAL_LLM_MODEL="gpt-oss"
|
|||||||
# Number of non-speech chunks to wait before speech is considered to have ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
|
# Number of non-speech chunks to wait before speech is considered to have ended. A chunk is approximately 31 ms. Increasing this number allows longer pauses in speech, but also increases response time.
|
||||||
BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15
|
BEHAVIOUR_SETTINGS__VAD_NON_SPEECH_PATIENCE_CHUNKS=15
|
||||||
|
|
||||||
# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, often the case when using Wi-Fi. Perhaps 500 ms. A symptom of this issue is transcriptions getting cut off.
|
# Timeout in milliseconds for socket polling. Increase this number if network latency/jitter is high, which is often the case when using Wi-Fi; try 500 ms or more. A symptom of this issue is transcriptions getting cut off.
|
||||||
BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=100
|
BEHAVIOUR_SETTINGS__SOCKET_POLLER_TIMEOUT_MS=400
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -241,12 +241,23 @@ class VADAgent(BaseAgent):
|
|||||||
self._reset_needed = False
|
self._reset_needed = False
|
||||||
|
|
||||||
assert self.audio_in_poller is not None
|
assert self.audio_in_poller is not None
|
||||||
|
|
||||||
|
non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
|
||||||
|
begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
|
||||||
|
prob_threshold = settings.behaviour_settings.vad_prob_threshold
|
||||||
|
|
||||||
data = await self.audio_in_poller.poll()
|
data = await self.audio_in_poller.poll()
|
||||||
if data is None:
|
if data is None:
|
||||||
if len(self.audio_buffer) > 0:
|
if len(self.audio_buffer) > 0:
|
||||||
self.logger.debug(
|
# Failed to receive new audio. Send remaining buffer to be transcribed.
|
||||||
"No audio data received. Discarding buffer until new data arrives."
|
if len(self.audio_buffer) > begin_silence_length * 512:
|
||||||
)
|
self.logger.debug("Speech ended.")
|
||||||
|
assert self.audio_out_socket is not None
|
||||||
|
await self.audio_out_socket.send(self.audio_buffer[: -2 * 512].tobytes())
|
||||||
|
else:
|
||||||
|
self.logger.debug(
|
||||||
|
"No audio data received. Discarding buffer until new data arrives."
|
||||||
|
)
|
||||||
self.audio_buffer = np.array([], dtype=np.float32)
|
self.audio_buffer = np.array([], dtype=np.float32)
|
||||||
self.i_since_speech = settings.behaviour_settings.vad_initial_since_speech
|
self.i_since_speech = settings.behaviour_settings.vad_initial_since_speech
|
||||||
continue
|
continue
|
||||||
@@ -255,9 +266,6 @@ class VADAgent(BaseAgent):
|
|||||||
chunk = np.frombuffer(data, dtype=np.float32).copy()
|
chunk = np.frombuffer(data, dtype=np.float32).copy()
|
||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
|
prob = self.model(torch.from_numpy(chunk), settings.vad_settings.sample_rate_hz).item()
|
||||||
non_speech_patience = settings.behaviour_settings.vad_non_speech_patience_chunks
|
|
||||||
begin_silence_length = settings.behaviour_settings.vad_begin_silence_chunks
|
|
||||||
prob_threshold = settings.behaviour_settings.vad_prob_threshold
|
|
||||||
|
|
||||||
if prob > prob_threshold:
|
if prob > prob_threshold:
|
||||||
if self.i_since_speech > non_speech_patience + begin_silence_length:
|
if self.i_since_speech > non_speech_patience + begin_silence_length:
|
||||||
|
|||||||
Reference in New Issue
Block a user