This commit is contained in:
朱潮 2026-03-22 00:52:11 +08:00
parent 7a547322e3
commit 2d2e1dbcdf

View File

@@ -71,6 +71,8 @@ class VoiceLiteSession:
         self._vad_silence_start: float = 0  # When silence started
         self._vad_finish_task: Optional[asyncio.Task] = None
         self._pre_buffer: list = []  # Buffer audio before VAD triggers
+        self._vad_voice_streak: int = 0  # Consecutive voiced chunks count
+        self._vad_silence_streak: int = 0  # Consecutive silent chunks count
 
     async def start(self) -> None:
         """Fetch bot config, mark session as running."""
@@ -113,6 +115,8 @@ class VoiceLiteSession:
     VAD_SOURCE_RATE = 24000  # Input audio sample rate
     VAD_TARGET_RATE = 16000  # webrtcvad supported sample rate
     VAD_FRAME_DURATION_MS = 30  # Frame duration for webrtcvad (10, 20, or 30 ms)
+    VAD_SPEECH_CHUNKS = 3  # Consecutive voiced chunks required to start speech
+    VAD_SILENCE_CHUNKS = 5  # Consecutive silent chunks required to confirm silence
 
     _audio_chunk_count = 0
@@ -164,13 +168,21 @@ class VoiceLiteSession:
         has_voice = self._webrtcvad_detect(audio_data)
         now = asyncio.get_event_loop().time()
 
+        # Update consecutive streaks
+        if has_voice:
+            self._vad_voice_streak += 1
+            self._vad_silence_streak = 0
+        else:
+            self._vad_silence_streak += 1
+            self._vad_voice_streak = 0
+
         if has_voice:
             # Cancel any pending finish
             if self._vad_finish_task and not self._vad_finish_task.done():
                 self._vad_finish_task.cancel()
                 self._vad_finish_task = None
-            if not self._vad_speaking:
+            if not self._vad_speaking and self._vad_voice_streak >= self.VAD_SPEECH_CHUNKS:
                 # Speech just started — connect ASR
                 self._vad_speaking = True
                 logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...")
@@ -205,8 +217,9 @@ class VoiceLiteSession:
             if self._vad_silence_start == 0:
                 self._vad_silence_start = now
 
-            # Silence exceeded threshold -> send finish
-            if (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION:
+            # Require both consecutive silent chunks AND time threshold
+            if (self._vad_silence_streak >= self.VAD_SILENCE_CHUNKS
+                    and (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION):
                 if not self._vad_finish_task or self._vad_finish_task.done():
                     self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
         else:
@@ -220,6 +233,8 @@ class VoiceLiteSession:
         logger.info(f"[VoiceLite] VAD: silence detected, sending finish to ASR")
         self._vad_speaking = False
         self._vad_silence_start = 0
+        self._vad_voice_streak = 0
+        self._vad_silence_streak = 0
         if self._asr_client:
             try:
                 await self._asr_client.send_finish()