From 35a368343929c84331ff10e22b54990441af9b08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= <zhuchaowe@users.noreply.github.com>
Date: Mon, 23 Mar 2026 17:32:07 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A016khz?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 services/voice_lite_session.py | 47 +++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/services/voice_lite_session.py b/services/voice_lite_session.py
index 4621e78..c3db322 100644
--- a/services/voice_lite_session.py
+++ b/services/voice_lite_session.py
@@ -179,23 +179,32 @@ class VoiceLiteSession:
         return audio_data
 
     def _webrtcvad_detect(self, pcm_data: bytes) -> bool:
-        """Run webrtcvad on audio data. Returns True if voice is detected in any frame."""
+        """Run webrtcvad on audio data. Returns True if voice is detected in any frame.
+
+        Expects 24kHz PCM input, resamples to 16kHz for webrtcvad.
+        """
         resampled = self._resample_24k_to_16k(pcm_data)
+        return self._webrtcvad_check(resampled)
+
+    def _webrtcvad_detect_16k(self, pcm_data: bytes) -> bool:
+        """Run webrtcvad directly on 16kHz PCM data (no resampling needed)."""
+        return self._webrtcvad_check(pcm_data)
+
+    def _webrtcvad_check(self, pcm_16k: bytes) -> bool:
+        """Core webrtcvad check on 16kHz PCM data."""
         frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2  # bytes per frame
-        if len(resampled) < frame_size:
+        if len(pcm_16k) < frame_size:
             return False
-        # Check frames; return True if any frame has voice
         voice_frames = 0
         total_frames = 0
-        for offset in range(0, len(resampled) - frame_size + 1, frame_size):
-            frame = resampled[offset:offset + frame_size]
+        for offset in range(0, len(pcm_16k) - frame_size + 1, frame_size):
+            frame = pcm_16k[offset:offset + frame_size]
             total_frames += 1
             try:
                 if self._vad.is_speech(frame, self.VAD_TARGET_RATE):
                     voice_frames += 1
             except Exception:
                 pass
-        # Consider voice detected if at least one frame has speech
         return voice_frames > 0
 
     async def handle_audio(self, audio_data: bytes) -> None:
@@ -203,11 +212,16 @@ class VoiceLiteSession:
         if not self._running:
             return
 
-        # Resample to 24kHz if client sends lower sample rate
-        audio_data = self._resample_input(audio_data)
+        # VAD detection: run directly at the original sample rate to avoid precision loss from double resampling
+        if self._client_sample_rate == 16000:
+            has_voice = self._webrtcvad_detect_16k(audio_data)
+        else:
+            has_voice = self._webrtcvad_detect(audio_data)
+
+        # The upsampled audio is used for ASR (ASR requires 24kHz)
+        audio_for_asr = self._resample_input(audio_data)
 
         self._audio_chunk_count += 1
-        has_voice = self._webrtcvad_detect(audio_data)
         now = asyncio.get_event_loop().time()
 
         # Update consecutive streaks
@@ -242,7 +256,7 @@ class VoiceLiteSession:
             # Send current chunk
             if self._asr_client:
                 try:
-                    await self._asr_client.send_audio(audio_data)
+                    await self._asr_client.send_audio(audio_for_asr)
                 except Exception:
                     pass
 
@@ -252,7 +266,7 @@ class VoiceLiteSession:
                 # Brief silence while speaking — keep sending for ASR context
                 if self._asr_client:
                     try:
-                        await self._asr_client.send_audio(audio_data)
+                        await self._asr_client.send_audio(audio_for_asr)
                     except Exception:
                         pass
 
@@ -266,7 +280,7 @@ class VoiceLiteSession:
                         self._vad_finish_task = asyncio.create_task(self._vad_send_finish())
             else:
                 # Not speaking — buffer recent audio for pre-speech context
-                self._pre_buffer.append(audio_data)
+                self._pre_buffer.append(audio_for_asr)
                 if len(self._pre_buffer) > self.VAD_PRE_BUFFER_SIZE:
                     self._pre_buffer.pop(0)
 
@@ -377,6 +391,15 @@ class VoiceLiteSession:
             if self._silence_timer_task and not self._silence_timer_task.done():
                 self._silence_timer_task.cancel()
 
+            # Reset VAD state for next utterance
+            self._vad_speaking = False
+            self._vad_silence_start = 0
+            self._vad_voice_streak = 0
+            self._vad_silence_streak = 0
+            if self._vad_finish_task and not self._vad_finish_task.done():
+                self._vad_finish_task.cancel()
+                self._vad_finish_task = None
+
             # Interrupt any in-progress agent+TTS
             await self._interrupt_current()