增加16khz

This commit is contained in:
朱潮 2026-03-23 17:32:07 +08:00
parent 30dc697071
commit 35a3683439

View File

@ -179,23 +179,32 @@ class VoiceLiteSession:
return audio_data
def _webrtcvad_detect(self, pcm_data: bytes) -> bool:
"""Run webrtcvad on audio data. Returns True if voice is detected in any frame."""
"""Run webrtcvad on audio data. Returns True if voice is detected in any frame.
Expects 24kHz PCM input, resamples to 16kHz for webrtcvad.
"""
resampled = self._resample_24k_to_16k(pcm_data)
return self._webrtcvad_check(resampled)
def _webrtcvad_detect_16k(self, pcm_data: bytes) -> bool:
"""Run webrtcvad directly on 16kHz PCM data (no resampling needed)."""
return self._webrtcvad_check(pcm_data)
def _webrtcvad_check(self, pcm_16k: bytes) -> bool:
"""Core webrtcvad check on 16kHz PCM data."""
frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2 # bytes per frame
if len(resampled) < frame_size:
if len(pcm_16k) < frame_size:
return False
# Check frames; return True if any frame has voice
voice_frames = 0
total_frames = 0
for offset in range(0, len(resampled) - frame_size + 1, frame_size):
frame = resampled[offset:offset + frame_size]
for offset in range(0, len(pcm_16k) - frame_size + 1, frame_size):
frame = pcm_16k[offset:offset + frame_size]
total_frames += 1
try:
if self._vad.is_speech(frame, self.VAD_TARGET_RATE):
voice_frames += 1
except Exception:
pass
# Consider voice detected if at least one frame has speech
return voice_frames > 0
async def handle_audio(self, audio_data: bytes) -> None:
@ -203,11 +212,16 @@ class VoiceLiteSession:
if not self._running:
return
# Resample to 24kHz if client sends lower sample rate
audio_data = self._resample_input(audio_data)
# VAD 检测:直接在原始采样率上做,避免双重重采样导致精度损失
if self._client_sample_rate == 16000:
has_voice = self._webrtcvad_detect_16k(audio_data)
else:
has_voice = self._webrtcvad_detect(audio_data)
# 上采样后的音频用于 ASR(ASR 需要 24kHz)
audio_for_asr = self._resample_input(audio_data)
self._audio_chunk_count += 1
has_voice = self._webrtcvad_detect(audio_data)
now = asyncio.get_event_loop().time()
# Update consecutive streaks
@ -242,7 +256,7 @@ class VoiceLiteSession:
# Send current chunk
if self._asr_client:
try:
await self._asr_client.send_audio(audio_data)
await self._asr_client.send_audio(audio_for_asr)
except Exception:
pass
@ -252,7 +266,7 @@ class VoiceLiteSession:
# Brief silence while speaking — keep sending for ASR context
if self._asr_client:
try:
await self._asr_client.send_audio(audio_data)
await self._asr_client.send_audio(audio_for_asr)
except Exception:
pass
@ -266,7 +280,7 @@ class VoiceLiteSession:
self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
else:
# Not speaking — buffer recent audio for pre-speech context
self._pre_buffer.append(audio_data)
self._pre_buffer.append(audio_for_asr)
if len(self._pre_buffer) > self.VAD_PRE_BUFFER_SIZE:
self._pre_buffer.pop(0)
@ -377,6 +391,15 @@ class VoiceLiteSession:
if self._silence_timer_task and not self._silence_timer_task.done():
self._silence_timer_task.cancel()
# Reset VAD state for next utterance
self._vad_speaking = False
self._vad_silence_start = 0
self._vad_voice_streak = 0
self._vad_silence_streak = 0
if self._vad_finish_task and not self._vad_finish_task.done():
self._vad_finish_task.cancel()
self._vad_finish_task = None
# Interrupt any in-progress agent+TTS
await self._interrupt_current()