From 35a368343929c84331ff10e22b54990441af9b08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Mon, 23 Mar 2026 17:32:07 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A016khz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/voice_lite_session.py | 47 +++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/services/voice_lite_session.py b/services/voice_lite_session.py index 4621e78..c3db322 100644 --- a/services/voice_lite_session.py +++ b/services/voice_lite_session.py @@ -179,23 +179,32 @@ class VoiceLiteSession: return audio_data def _webrtcvad_detect(self, pcm_data: bytes) -> bool: - """Run webrtcvad on audio data. Returns True if voice is detected in any frame.""" + """Run webrtcvad on audio data. Returns True if voice is detected in any frame. + + Expects 24kHz PCM input, resamples to 16kHz for webrtcvad. + """ resampled = self._resample_24k_to_16k(pcm_data) + return self._webrtcvad_check(resampled) + + def _webrtcvad_detect_16k(self, pcm_data: bytes) -> bool: + """Run webrtcvad directly on 16kHz PCM data (no resampling needed).""" + return self._webrtcvad_check(pcm_data) + + def _webrtcvad_check(self, pcm_16k: bytes) -> bool: + """Core webrtcvad check on 16kHz PCM data.""" frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2 # bytes per frame - if len(resampled) < frame_size: + if len(pcm_16k) < frame_size: return False - # Check frames; return True if any frame has voice voice_frames = 0 total_frames = 0 - for offset in range(0, len(resampled) - frame_size + 1, frame_size): - frame = resampled[offset:offset + frame_size] + for offset in range(0, len(pcm_16k) - frame_size + 1, frame_size): + frame = pcm_16k[offset:offset + frame_size] total_frames += 1 try: if self._vad.is_speech(frame, self.VAD_TARGET_RATE): voice_frames += 1 except Exception: pass - # Consider voice detected if at least one frame has speech return voice_frames > 0 async def handle_audio(self, audio_data: bytes) -> None: @@ -203,11 +212,16 @@ class VoiceLiteSession: if not self._running: return - # Resample to 24kHz if client sends lower sample rate - audio_data = self._resample_input(audio_data) + # VAD 检测:直接在原始采样率上做,避免双重重采样导致精度损失 + if self._client_sample_rate == 16000: + has_voice = self._webrtcvad_detect_16k(audio_data) + else: + has_voice = self._webrtcvad_detect(audio_data) + + # 上采样后的音频用于 ASR(ASR 需要 24kHz) + audio_for_asr = self._resample_input(audio_data) self._audio_chunk_count += 1 - has_voice = self._webrtcvad_detect(audio_data) now = asyncio.get_event_loop().time() # Update consecutive streaks @@ -242,7 +256,7 @@ class VoiceLiteSession: # Send current chunk if self._asr_client: try: - await self._asr_client.send_audio(audio_data) + await self._asr_client.send_audio(audio_for_asr) except Exception: pass @@ -252,7 +266,7 @@ class VoiceLiteSession: # Brief silence while speaking — keep sending for ASR context if self._asr_client: try: - await self._asr_client.send_audio(audio_data) + await self._asr_client.send_audio(audio_for_asr) except Exception: pass @@ -266,7 +280,7 @@ class VoiceLiteSession: self._vad_finish_task = asyncio.create_task(self._vad_send_finish()) else: # Not speaking — buffer recent audio for pre-speech context - self._pre_buffer.append(audio_data) + self._pre_buffer.append(audio_for_asr) if len(self._pre_buffer) > self.VAD_PRE_BUFFER_SIZE: self._pre_buffer.pop(0) @@ -377,6 +391,15 @@ class VoiceLiteSession: if self._silence_timer_task and not self._silence_timer_task.done(): self._silence_timer_task.cancel() + # Reset VAD state for next utterance + self._vad_speaking = False + self._vad_silence_start = 0 + self._vad_voice_streak = 0 + self._vad_silence_streak = 0 + if self._vad_finish_task and not self._vad_finish_task.done(): + self._vad_finish_task.cancel() + self._vad_finish_task = None + # Interrupt any in-progress agent+TTS await self._interrupt_current()