vad

2026-03-22 00:52:11 +08:00 · 2026-03-22 00:52:11 +08:00 · 2d2e1dbcdf
commit 2d2e1dbcdf
parent 7a547322e3
1 changed file with 18 additions and 3 deletions
--- a/services/voice_lite_session.py
+++ b/services/voice_lite_session.py
@@ -71,6 +71,8 @@ class VoiceLiteSession:
        self._vad_silence_start: float = 0  # When silence started
        self._vad_finish_task: Optional[asyncio.Task] = None
        self._pre_buffer: list = []  # Buffer audio before VAD triggers
+        self._vad_voice_streak: int = 0   # Consecutive voiced chunks count
+        self._vad_silence_streak: int = 0  # Consecutive silent chunks count

    async def start(self) -> None:
        """Fetch bot config, mark session as running."""
@@ -113,6 +115,8 @@ class VoiceLiteSession:
    VAD_SOURCE_RATE = 24000     # Input audio sample rate
    VAD_TARGET_RATE = 16000     # webrtcvad supported sample rate
    VAD_FRAME_DURATION_MS = 30  # Frame duration for webrtcvad (10, 20, or 30 ms)
+    VAD_SPEECH_CHUNKS = 3       # Consecutive voiced chunks required to start speech
+    VAD_SILENCE_CHUNKS = 5      # Consecutive silent chunks required to confirm silence

    _audio_chunk_count = 0

@@ -164,13 +168,21 @@ class VoiceLiteSession:
        has_voice = self._webrtcvad_detect(audio_data)
        now = asyncio.get_event_loop().time()

+        # Update consecutive streaks
+        if has_voice:
+            self._vad_voice_streak += 1
+            self._vad_silence_streak = 0
+        else:
+            self._vad_silence_streak += 1
+            self._vad_voice_streak = 0
+
        if has_voice:
            # Cancel any pending finish
            if self._vad_finish_task and not self._vad_finish_task.done():
                self._vad_finish_task.cancel()
                self._vad_finish_task = None

-            if not self._vad_speaking:
+            if not self._vad_speaking and self._vad_voice_streak >= self.VAD_SPEECH_CHUNKS:
                # Speech just started — connect ASR
                self._vad_speaking = True
                logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...")
@@ -205,8 +217,9 @@ class VoiceLiteSession:
                if self._vad_silence_start == 0:
                    self._vad_silence_start = now

-                # Silence exceeded threshold -> send finish
-                if (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION:
+                # Require both consecutive silent chunks AND time threshold
+                if (self._vad_silence_streak >= self.VAD_SILENCE_CHUNKS
+                        and (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION):
                    if not self._vad_finish_task or self._vad_finish_task.done():
                        self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
            else:
@@ -220,6 +233,8 @@ class VoiceLiteSession:
        logger.info(f"[VoiceLite] VAD: silence detected, sending finish to ASR")
        self._vad_speaking = False
        self._vad_silence_start = 0
+        self._vad_voice_streak = 0
+        self._vad_silence_streak = 0
        if self._asr_client:
            try:
                await self._asr_client.send_finish()