From 2d2e1dbcdf3713a28fc12703ed3da293504b5a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Sun, 22 Mar 2026 00:52:11 +0800 Subject: [PATCH] vad --- services/voice_lite_session.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/services/voice_lite_session.py b/services/voice_lite_session.py index c77ff9d..d520261 100644 --- a/services/voice_lite_session.py +++ b/services/voice_lite_session.py @@ -71,6 +71,8 @@ class VoiceLiteSession: self._vad_silence_start: float = 0 # When silence started self._vad_finish_task: Optional[asyncio.Task] = None self._pre_buffer: list = [] # Buffer audio before VAD triggers + self._vad_voice_streak: int = 0 # Consecutive voiced chunks count + self._vad_silence_streak: int = 0 # Consecutive silent chunks count async def start(self) -> None: """Fetch bot config, mark session as running.""" @@ -113,6 +115,8 @@ class VoiceLiteSession: VAD_SOURCE_RATE = 24000 # Input audio sample rate VAD_TARGET_RATE = 16000 # webrtcvad supported sample rate VAD_FRAME_DURATION_MS = 30 # Frame duration for webrtcvad (10, 20, or 30 ms) + VAD_SPEECH_CHUNKS = 3 # Consecutive voiced chunks required to start speech + VAD_SILENCE_CHUNKS = 5 # Consecutive silent chunks required to confirm silence _audio_chunk_count = 0 @@ -164,13 +168,21 @@ class VoiceLiteSession: has_voice = self._webrtcvad_detect(audio_data) now = asyncio.get_event_loop().time() + # Update consecutive streaks + if has_voice: + self._vad_voice_streak += 1 + self._vad_silence_streak = 0 + else: + self._vad_silence_streak += 1 + self._vad_voice_streak = 0 + if has_voice: # Cancel any pending finish if self._vad_finish_task and not self._vad_finish_task.done(): self._vad_finish_task.cancel() self._vad_finish_task = None - if not self._vad_speaking: + if not self._vad_speaking and self._vad_voice_streak >= self.VAD_SPEECH_CHUNKS: # Speech just started — connect ASR self._vad_speaking = True logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...") @@ -205,8 +217,9 @@ class VoiceLiteSession: if self._vad_silence_start == 0: self._vad_silence_start = now - # Silence exceeded threshold -> send finish - if (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION: + # Require both consecutive silent chunks AND time threshold + if (self._vad_silence_streak >= self.VAD_SILENCE_CHUNKS + and (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION): if not self._vad_finish_task or self._vad_finish_task.done(): self._vad_finish_task = asyncio.create_task(self._vad_send_finish()) else: @@ -220,6 +233,8 @@ class VoiceLiteSession: logger.info(f"[VoiceLite] VAD: silence detected, sending finish to ASR") self._vad_speaking = False self._vad_silence_start = 0 + self._vad_voice_streak = 0 + self._vad_silence_streak = 0 if self._asr_client: try: await self._asr_client.send_finish()