增加 16kHz 支持
This commit is contained in:
parent
30dc697071
commit
35a3683439
@ -179,23 +179,32 @@ class VoiceLiteSession:
|
|||||||
return audio_data
|
return audio_data
|
||||||
|
|
||||||
def _webrtcvad_detect(self, pcm_data: bytes) -> bool:
|
def _webrtcvad_detect(self, pcm_data: bytes) -> bool:
|
||||||
"""Run webrtcvad on audio data. Returns True if voice is detected in any frame."""
|
"""Run webrtcvad on audio data. Returns True if voice is detected in any frame.
|
||||||
|
|
||||||
|
Expects 24kHz PCM input, resamples to 16kHz for webrtcvad.
|
||||||
|
"""
|
||||||
resampled = self._resample_24k_to_16k(pcm_data)
|
resampled = self._resample_24k_to_16k(pcm_data)
|
||||||
|
return self._webrtcvad_check(resampled)
|
||||||
|
|
||||||
|
def _webrtcvad_detect_16k(self, pcm_data: bytes) -> bool:
|
||||||
|
"""Run webrtcvad directly on 16kHz PCM data (no resampling needed)."""
|
||||||
|
return self._webrtcvad_check(pcm_data)
|
||||||
|
|
||||||
|
def _webrtcvad_check(self, pcm_16k: bytes) -> bool:
|
||||||
|
"""Core webrtcvad check on 16kHz PCM data."""
|
||||||
frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2 # bytes per frame
|
frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2 # bytes per frame
|
||||||
if len(resampled) < frame_size:
|
if len(pcm_16k) < frame_size:
|
||||||
return False
|
return False
|
||||||
# Check frames; return True if any frame has voice
|
|
||||||
voice_frames = 0
|
voice_frames = 0
|
||||||
total_frames = 0
|
total_frames = 0
|
||||||
for offset in range(0, len(resampled) - frame_size + 1, frame_size):
|
for offset in range(0, len(pcm_16k) - frame_size + 1, frame_size):
|
||||||
frame = resampled[offset:offset + frame_size]
|
frame = pcm_16k[offset:offset + frame_size]
|
||||||
total_frames += 1
|
total_frames += 1
|
||||||
try:
|
try:
|
||||||
if self._vad.is_speech(frame, self.VAD_TARGET_RATE):
|
if self._vad.is_speech(frame, self.VAD_TARGET_RATE):
|
||||||
voice_frames += 1
|
voice_frames += 1
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Consider voice detected if at least one frame has speech
|
|
||||||
return voice_frames > 0
|
return voice_frames > 0
|
||||||
|
|
||||||
async def handle_audio(self, audio_data: bytes) -> None:
|
async def handle_audio(self, audio_data: bytes) -> None:
|
||||||
@ -203,11 +212,16 @@ class VoiceLiteSession:
|
|||||||
if not self._running:
|
if not self._running:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Resample to 24kHz if client sends lower sample rate
|
# VAD 检测:直接在原始采样率上做,避免双重重采样导致精度损失
|
||||||
audio_data = self._resample_input(audio_data)
|
if self._client_sample_rate == 16000:
|
||||||
|
has_voice = self._webrtcvad_detect_16k(audio_data)
|
||||||
|
else:
|
||||||
|
has_voice = self._webrtcvad_detect(audio_data)
|
||||||
|
|
||||||
|
# 上采样后的音频用于 ASR(ASR 需要 24kHz)
|
||||||
|
audio_for_asr = self._resample_input(audio_data)
|
||||||
|
|
||||||
self._audio_chunk_count += 1
|
self._audio_chunk_count += 1
|
||||||
has_voice = self._webrtcvad_detect(audio_data)
|
|
||||||
now = asyncio.get_event_loop().time()
|
now = asyncio.get_event_loop().time()
|
||||||
|
|
||||||
# Update consecutive streaks
|
# Update consecutive streaks
|
||||||
@ -242,7 +256,7 @@ class VoiceLiteSession:
|
|||||||
# Send current chunk
|
# Send current chunk
|
||||||
if self._asr_client:
|
if self._asr_client:
|
||||||
try:
|
try:
|
||||||
await self._asr_client.send_audio(audio_data)
|
await self._asr_client.send_audio(audio_for_asr)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -252,7 +266,7 @@ class VoiceLiteSession:
|
|||||||
# Brief silence while speaking — keep sending for ASR context
|
# Brief silence while speaking — keep sending for ASR context
|
||||||
if self._asr_client:
|
if self._asr_client:
|
||||||
try:
|
try:
|
||||||
await self._asr_client.send_audio(audio_data)
|
await self._asr_client.send_audio(audio_for_asr)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -266,7 +280,7 @@ class VoiceLiteSession:
|
|||||||
self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
|
self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
|
||||||
else:
|
else:
|
||||||
# Not speaking — buffer recent audio for pre-speech context
|
# Not speaking — buffer recent audio for pre-speech context
|
||||||
self._pre_buffer.append(audio_data)
|
self._pre_buffer.append(audio_for_asr)
|
||||||
if len(self._pre_buffer) > self.VAD_PRE_BUFFER_SIZE:
|
if len(self._pre_buffer) > self.VAD_PRE_BUFFER_SIZE:
|
||||||
self._pre_buffer.pop(0)
|
self._pre_buffer.pop(0)
|
||||||
|
|
||||||
@ -377,6 +391,15 @@ class VoiceLiteSession:
|
|||||||
if self._silence_timer_task and not self._silence_timer_task.done():
|
if self._silence_timer_task and not self._silence_timer_task.done():
|
||||||
self._silence_timer_task.cancel()
|
self._silence_timer_task.cancel()
|
||||||
|
|
||||||
|
# Reset VAD state for next utterance
|
||||||
|
self._vad_speaking = False
|
||||||
|
self._vad_silence_start = 0
|
||||||
|
self._vad_voice_streak = 0
|
||||||
|
self._vad_silence_streak = 0
|
||||||
|
if self._vad_finish_task and not self._vad_finish_task.done():
|
||||||
|
self._vad_finish_task.cancel()
|
||||||
|
self._vad_finish_task = None
|
||||||
|
|
||||||
# Interrupt any in-progress agent+TTS
|
# Interrupt any in-progress agent+TTS
|
||||||
await self._interrupt_current()
|
await self._interrupt_current()
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user