vad
This commit is contained in:
parent
7a547322e3
commit
2d2e1dbcdf
@ -71,6 +71,8 @@ class VoiceLiteSession:
|
|||||||
self._vad_silence_start: float = 0 # When silence started
|
self._vad_silence_start: float = 0 # When silence started
|
||||||
self._vad_finish_task: Optional[asyncio.Task] = None
|
self._vad_finish_task: Optional[asyncio.Task] = None
|
||||||
self._pre_buffer: list = [] # Buffer audio before VAD triggers
|
self._pre_buffer: list = [] # Buffer audio before VAD triggers
|
||||||
|
self._vad_voice_streak: int = 0 # Consecutive voiced chunks count
|
||||||
|
self._vad_silence_streak: int = 0 # Consecutive silent chunks count
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
"""Fetch bot config, mark session as running."""
|
"""Fetch bot config, mark session as running."""
|
||||||
@ -113,6 +115,8 @@ class VoiceLiteSession:
|
|||||||
VAD_SOURCE_RATE = 24000 # Input audio sample rate
|
VAD_SOURCE_RATE = 24000 # Input audio sample rate
|
||||||
VAD_TARGET_RATE = 16000 # webrtcvad supported sample rate
|
VAD_TARGET_RATE = 16000 # webrtcvad supported sample rate
|
||||||
VAD_FRAME_DURATION_MS = 30 # Frame duration for webrtcvad (10, 20, or 30 ms)
|
VAD_FRAME_DURATION_MS = 30 # Frame duration for webrtcvad (10, 20, or 30 ms)
|
||||||
|
VAD_SPEECH_CHUNKS = 3 # Consecutive voiced chunks required to start speech
|
||||||
|
VAD_SILENCE_CHUNKS = 5 # Consecutive silent chunks required to confirm silence
|
||||||
|
|
||||||
_audio_chunk_count = 0
|
_audio_chunk_count = 0
|
||||||
|
|
||||||
@ -164,13 +168,21 @@ class VoiceLiteSession:
|
|||||||
has_voice = self._webrtcvad_detect(audio_data)
|
has_voice = self._webrtcvad_detect(audio_data)
|
||||||
now = asyncio.get_event_loop().time()
|
now = asyncio.get_event_loop().time()
|
||||||
|
|
||||||
|
# Update consecutive streaks
|
||||||
|
if has_voice:
|
||||||
|
self._vad_voice_streak += 1
|
||||||
|
self._vad_silence_streak = 0
|
||||||
|
else:
|
||||||
|
self._vad_silence_streak += 1
|
||||||
|
self._vad_voice_streak = 0
|
||||||
|
|
||||||
if has_voice:
|
if has_voice:
|
||||||
# Cancel any pending finish
|
# Cancel any pending finish
|
||||||
if self._vad_finish_task and not self._vad_finish_task.done():
|
if self._vad_finish_task and not self._vad_finish_task.done():
|
||||||
self._vad_finish_task.cancel()
|
self._vad_finish_task.cancel()
|
||||||
self._vad_finish_task = None
|
self._vad_finish_task = None
|
||||||
|
|
||||||
if not self._vad_speaking:
|
if not self._vad_speaking and self._vad_voice_streak >= self.VAD_SPEECH_CHUNKS:
|
||||||
# Speech just started — connect ASR
|
# Speech just started — connect ASR
|
||||||
self._vad_speaking = True
|
self._vad_speaking = True
|
||||||
logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...")
|
logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...")
|
||||||
@ -205,8 +217,9 @@ class VoiceLiteSession:
|
|||||||
if self._vad_silence_start == 0:
|
if self._vad_silence_start == 0:
|
||||||
self._vad_silence_start = now
|
self._vad_silence_start = now
|
||||||
|
|
||||||
# Silence exceeded threshold -> send finish
|
# Require both consecutive silent chunks AND time threshold
|
||||||
if (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION:
|
if (self._vad_silence_streak >= self.VAD_SILENCE_CHUNKS
|
||||||
|
and (now - self._vad_silence_start) >= self.VAD_SILENCE_DURATION):
|
||||||
if not self._vad_finish_task or self._vad_finish_task.done():
|
if not self._vad_finish_task or self._vad_finish_task.done():
|
||||||
self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
|
self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
|
||||||
else:
|
else:
|
||||||
@ -220,6 +233,8 @@ class VoiceLiteSession:
|
|||||||
logger.info(f"[VoiceLite] VAD: silence detected, sending finish to ASR")
|
logger.info(f"[VoiceLite] VAD: silence detected, sending finish to ASR")
|
||||||
self._vad_speaking = False
|
self._vad_speaking = False
|
||||||
self._vad_silence_start = 0
|
self._vad_silence_start = 0
|
||||||
|
self._vad_voice_streak = 0
|
||||||
|
self._vad_silence_streak = 0
|
||||||
if self._asr_client:
|
if self._asr_client:
|
||||||
try:
|
try:
|
||||||
await self._asr_client.send_finish()
|
await self._asr_client.send_finish()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user