From 7a547322e3de5b50ae960d66b422e67d7e0ce737 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= <zhuchaowe@users.noreply.github.com>
Date: Sun, 22 Mar 2026 00:42:57 +0800
Subject: [PATCH] =?UTF-8?q?=E8=AF=AD=E4=B9=89=E5=88=86=E5=89=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 poetry.lock                    |  36 +++++----
 pyproject.toml                 |   4 +-
 requirements.txt               |   3 +-
 services/voice_lite_session.py | 104 +++++++++++++++---------
 services/voice_utils.py        | 144 +++++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+), 55 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 2111ea3..85ce8bf 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5302,25 +5302,19 @@ train = ["accelerate (>=0.20.3)", "datasets"]
 
 [[package]]
 name = "setuptools"
-version = "82.0.1"
-description = "Most extensible Python build backend with support for C/C++ extension modules"
+version = "70.3.0"
+description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
-python-versions = ">=3.9"
+python-versions = ">=3.8"
 groups = ["main"]
-markers = "python_version >= \"3.13\""
 files = [
-    {file = "setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb"},
-    {file = "setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9"},
+    {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"},
+    {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"},
 ]
 
 [package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.13.0) ; sys_platform != \"cygwin\""]
-core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
-type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.18.*)", "pytest-mypy"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.3.2) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "shellingham"
@@ -6347,6 +6341,20 @@ files = [
     {file = "wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159"},
 ]
 
+[[package]]
+name = "webrtcvad"
+version = "2.0.10"
+description = "Python interface to the Google WebRTC Voice Activity Detector (VAD)"
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "webrtcvad-2.0.10.tar.gz", hash = "sha256:f1bed2fb25b63fb7b1a55d64090c993c9c9167b28485ae0bcdd81cf6ede96aea"},
+]
+
+[package.extras]
+dev = ["check-manifest", "memory_profiler", "nose", "psutil", "unittest2", "zest.releaser"]
+
 [[package]]
 name = "websockets"
 version = "15.0.1"
@@ -6983,4 +6991,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12,<3.15"
-content-hash = "1461514ed1f9639f41f43ebb28f2a3fcd2d5a5dde954cd509c0ea7bf181e9bb6"
+content-hash = "c9c4f80cdbf7d6bce20f65f40b9adce05c5f4a830299de148fcd8482937bddb0"
diff --git a/pyproject.toml b/pyproject.toml
index 8c563d4..494e70d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,9 @@ dependencies = [
     "ragflow-sdk (>=0.23.0,<0.24.0)",
     "httpx (>=0.28.1,<0.29.0)",
     "wsgidav (>=4.3.3,<5.0.0)",
-    "websockets (>=15.0.0,<16.0.0)"
+    "websockets (>=15.0.0,<16.0.0)",
+    "setuptools (<71)",
+    "webrtcvad (>=2.0.10,<3.0.0)",
 ]
 
 [tool.poetry.requires-plugins]
diff --git a/requirements.txt b/requirements.txt
index 8e1d8bf..dd2a8ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -165,7 +165,7 @@ safetensors==0.7.0 ; python_version >= "3.12" and python_version < "3.15"
 scikit-learn==1.8.0 ; python_version >= "3.12" and python_version < "3.15"
 scipy==1.17.1 ; python_version >= "3.12" and python_version < "3.15"
 sentence-transformers==3.4.1 ; python_version >= "3.12" and python_version < "3.15"
-setuptools==82.0.1 ; python_version >= "3.13" and python_version < "3.15"
+setuptools==70.3.0 ; python_version >= "3.12" and python_version < "3.15"
 shellingham==1.5.4 ; python_version >= "3.12" and python_version < "3.15"
 six==1.17.0 ; python_version >= "3.12" and python_version < "3.15"
 sniffio==1.3.1 ; python_version >= "3.12" and python_version < "3.15"
@@ -203,6 +203,7 @@ uvloop==0.22.1 ; python_version >= "3.12" and python_version < "3.15"
 watchfiles==1.1.1 ; python_version >= "3.12" and python_version < "3.15"
 wcmatch==10.1 ; python_version >= "3.12" and python_version < "3.15"
 wcwidth==0.6.0 ; python_version >= "3.12" and python_version < "3.15"
+webrtcvad==2.0.10 ; python_version >= "3.12" and python_version < "3.15"
 websockets==15.0.1 ; python_version >= "3.12" and python_version < "3.15"
 wrapt==1.17.3 ; python_version >= "3.12" and python_version < "3.15"
 wsgidav==4.3.3 ; python_version >= "3.12" and python_version < "3.15"
diff --git a/services/voice_lite_session.py b/services/voice_lite_session.py
index f5a2506..c77ff9d 100644
--- a/services/voice_lite_session.py
+++ b/services/voice_lite_session.py
@@ -4,13 +4,15 @@ import struct
 import uuid
 from typing import Optional, Callable, Awaitable
 
+import webrtcvad
+
 from services.streaming_asr_client import StreamingASRClient
 from services.streaming_tts_client import StreamingTTSClient
 from services.voice_utils import (
     StreamTagFilter,
     clean_markdown,
     stream_v3_agent,
-    SENTENCE_END_RE,
+    TTSSentenceSplitter,
 )
 from utils.settings import VOICE_LITE_SILENCE_TIMEOUT
 
@@ -63,7 +65,8 @@ class VoiceLiteSession:
         self._last_asr_emit_time: float = 0
         self._utterance_lock = asyncio.Lock()
 
-        # VAD (Voice Activity Detection) state
+        # VAD (Voice Activity Detection) via webrtcvad
+        self._vad = webrtcvad.Vad(2)  # aggressiveness 0-3 (2 = balanced)
         self._vad_speaking = False  # Whether user is currently speaking
         self._vad_silence_start: float = 0  # When silence started
         self._vad_finish_task: Optional[asyncio.Task] = None
@@ -105,23 +108,52 @@ class VoiceLiteSession:
             await self._asr_client.close()
 
     # VAD configuration
-    VAD_ENERGY_THRESHOLD = 500  # RMS energy threshold for voice detection
     VAD_SILENCE_DURATION = 1.5  # Seconds of silence before sending finish
     VAD_PRE_BUFFER_SIZE = 5     # Number of audio chunks to buffer before VAD triggers
+    VAD_SOURCE_RATE = 24000     # Input audio sample rate
+    VAD_TARGET_RATE = 16000     # webrtcvad supported sample rate
+    VAD_FRAME_DURATION_MS = 30  # Frame duration for webrtcvad (10, 20, or 30 ms)
 
     _audio_chunk_count = 0
 
     @staticmethod
-    def _calc_rms(pcm_data: bytes) -> float:
-        """Calculate RMS energy of 16-bit PCM audio."""
-        if len(pcm_data) < 2:
-            return 0.0
+    def _resample_24k_to_16k(pcm_data: bytes) -> bytes:
+        """Downsample 16-bit PCM from 24kHz to 16kHz (ratio 3:2).
+
+        Keeps 2 of every 3 little-endian samples; a trailing odd byte is ignored.
+        """
         n_samples = len(pcm_data) // 2
+        if n_samples == 0:
+            return b''
         samples = struct.unpack(f'<{n_samples}h', pcm_data[:n_samples * 2])
-        if not samples:
-            return 0.0
-        sum_sq = sum(s * s for s in samples)
-        return (sum_sq / n_samples) ** 0.5
+        # Output sample i comes from source index floor(i * 3 / 2): 0, 1, 3, 4, 6, ...
+        # Since i < out_len <= 2 * n_samples / 3, that index stays below n_samples
+        # and the lookup needs no bounds check.
+        # NOTE(review): plain decimation — no low-pass filter is applied before
+        # dropping samples.
+        out_len = (n_samples * 2) // 3
+        resampled = [samples[(i * 3) // 2] for i in range(out_len)]
+        return struct.pack(f'<{out_len}h', *resampled)
+
+    def _webrtcvad_detect(self, pcm_data: bytes) -> bool:
+        """Run webrtcvad on audio data. Returns True if voice is detected in any frame.
+
+        The chunk is downsampled to VAD_TARGET_RATE and cut into frames of
+        VAD_FRAME_DURATION_MS; a trailing partial frame is not checked.
+        """
+        resampled = self._resample_24k_to_16k(pcm_data)
+        # Bytes per frame: samples per frame times 2 bytes per 16-bit sample
+        frame_size = (self.VAD_TARGET_RATE * self.VAD_FRAME_DURATION_MS // 1000) * 2
+        detected = False  # Stays False when the chunk is shorter than one frame
+        for offset in range(0, len(resampled) - frame_size + 1, frame_size):
+            frame = resampled[offset:offset + frame_size]
+            try:
+                if self._vad.is_speech(frame, self.VAD_TARGET_RATE):
+                    detected = True  # Keep looping: every frame is still passed to the VAD
+            except Exception as exc:
+                # Best effort: a frame the VAD rejects is treated as non-speech
+                logger.debug("[VoiceLite] VAD: frame rejected: %s", exc)
+        return detected
 
     async def handle_audio(self, audio_data: bytes) -> None:
         """Forward user audio to ASR with VAD gating. Lazy-connect on speech start."""
@@ -129,8 +161,7 @@ class VoiceLiteSession:
             return
 
         self._audio_chunk_count += 1
-        rms = self._calc_rms(audio_data)
-        has_voice = rms > self.VAD_ENERGY_THRESHOLD
+        has_voice = self._webrtcvad_detect(audio_data)
         now = asyncio.get_event_loop().time()
 
         if has_voice:
@@ -142,7 +173,7 @@ class VoiceLiteSession:
             if not self._vad_speaking:
                 # Speech just started — connect ASR
                 self._vad_speaking = True
-                logger.info(f"[VoiceLite] VAD: speech started (rms={rms:.0f}), connecting ASR...")
+                logger.info(f"[VoiceLite] VAD: speech started (webrtcvad), connecting ASR...")
                 try:
                     await self._connect_asr()
                     # Send buffered pre-speech audio
@@ -320,8 +351,8 @@ class VoiceLiteSession:
             await self._emit_status("thinking")
 
             accumulated_text = []
-            sentence_buf = ""
             tag_filter = StreamTagFilter()
+            splitter = TTSSentenceSplitter()
             tts_client = StreamingTTSClient(speaker=self._speaker)
             speaking = False
 
@@ -340,26 +371,20 @@ class VoiceLiteSession:
                 passthrough = tag_filter.feed(chunk)
 
                 if not passthrough:
-                    if tag_filter.answer_ended and sentence_buf:
-                        flush = clean_markdown(sentence_buf.strip())
-                        sentence_buf = ""
-                        if flush:
-                            if not speaking:
-                                await self._emit_status("speaking")
-                                speaking = True
-                            await self._send_tts(tts_client, flush)
+                    if tag_filter.answer_ended:
+                        for sentence in splitter.flush():
+                            sentence = clean_markdown(sentence)
+                            if sentence:
+                                if not speaking:
+                                    await self._emit_status("speaking")
+                                    speaking = True
+                                await self._send_tts(tts_client, sentence)
                     continue
 
-                sentence_buf += passthrough
-
-                while True:
-                    match = SENTENCE_END_RE.search(sentence_buf)
-                    if not match:
-                        break
-                    end_pos = match.end()
-                    sentence = clean_markdown(sentence_buf[:end_pos].strip())
-                    sentence_buf = sentence_buf[end_pos:]
-
+                # Feed raw passthrough to splitter (preserve newlines for splitting),
+                # apply clean_markdown on output sentences
+                for sentence in splitter.feed(passthrough):
+                    sentence = clean_markdown(sentence)
                     if sentence:
                         if not speaking:
                             await self._emit_status("speaking")
@@ -367,12 +392,13 @@ class VoiceLiteSession:
                         await self._send_tts(tts_client, sentence)
 
             # Handle remaining text
-            remaining = clean_markdown(sentence_buf.strip())
-            if remaining:
-                if not speaking:
-                    await self._emit_status("speaking")
-                    speaking = True
-                await self._send_tts(tts_client, remaining)
+            for sentence in splitter.flush():
+                sentence = clean_markdown(sentence)
+                if sentence:
+                    if not speaking:
+                        await self._emit_status("speaking")
+                        speaking = True
+                    await self._send_tts(tts_client, sentence)
 
             # Log full agent result (not sent to frontend, already streamed)
             full_result = "".join(accumulated_text)
diff --git a/services/voice_utils.py b/services/voice_utils.py
index 800b28d..92da9fc 100644
--- a/services/voice_utils.py
+++ b/services/voice_utils.py
@@ -7,6 +7,150 @@ logger = logging.getLogger('app')
 
 SENTENCE_END_RE = re.compile(r'[。！？；\n.!?;]')
 
+# Emoji pattern: runs of emoji/symbol code points; no CJK ideograph block is listed
+_EMOJI_RE = re.compile(
+    "["
+    "\U0001F600-\U0001F64F"  # emoticons
+    "\U0001F300-\U0001F5FF"  # symbols & pictographs
+    "\U0001F680-\U0001F6FF"  # transport & map
+    "\U0001F1E0-\U0001F1FF"  # flags
+    "\U0001F900-\U0001F9FF"  # supplemental symbols
+    "\U0001FA00-\U0001FA6F"  # chess symbols
+    "\U0001FA70-\U0001FAFF"  # symbols extended-A
+    "\U00002702-\U000027B0"  # dingbats
+    "\U00002600-\U000026FF"  # misc symbols
+    "\U0000FE00-\U0000FE0F"  # variation selectors
+    "\U0000200D"             # zero width joiner
+    "\U000024C2"             # Ⓜ enclosed letter
+    "\U00002B50\U00002B55"   # star, circle
+    "\U000023CF\U000023E9-\U000023F3\U000023F8-\U000023FA"  # media controls
+    "\U0001F170-\U0001F251"  # enclosed alphanumerics/ideographs (also spans the flags range above)
+    "]+",
+    flags=re.UNICODE,  # already the default for str patterns in Python 3
+)
+
+# Strong sentence-ending punctuation (excluding \n which is handled separately)
+_STRONG_PUNCT_RE = re.compile(r'[。！？；.!?;～~]')
+# Soft punctuation plus space/tab (usable as split points when buffer is getting long)
+_SOFT_PUNCT_RE = re.compile(r'[，,：:、）) \t]')
+
+
+class TTSSentenceSplitter:
+    """
+    Intelligent sentence splitter for TTS streaming.
+
+    Rules (in priority order):
+    1. Split at a newline once the text before it has >= MIN_LENGTH characters;
+       a shorter paragraph is merged into the text that follows it
+    2. Split on strong punctuation (。！？～ etc.) only if accumulated >= MIN_LENGTH
+    3. If buffer reaches SOFT_THRESHOLD, also split on soft punctuation (，、etc.)
+    4. If buffer reaches MAX_LENGTH, force split at best available position
+    - Strip emoji from output (TTS cannot pronounce them)
+    - On flush(), return any remaining text regardless of length
+    """
+
+    MIN_LENGTH = 10    # Don't send sentences shorter than this
+    SOFT_THRESHOLD = 30  # Start considering soft punctuation splits
+    MAX_LENGTH = 80    # Force split even without punctuation
+
+    def __init__(self):
+        self._buf = ""
+
+    def _clean_for_tts(self, text: str) -> str:
+        """Remove emoji and collapse whitespace."""
+        text = _EMOJI_RE.sub("", text)
+        text = re.sub(r'[ \t]+', ' ', text)
+        return text.strip()
+
+    def feed(self, chunk: str) -> list[str]:
+        """Feed a text chunk, return list of ready sentences (may be empty)."""
+        self._buf += chunk
+        results = []
+
+        while self._buf:
+            buf_len = len(self._buf)
+
+            # 0. Newline split — highest priority
+            nl_pos = self._buf.find('\n')
+            if nl_pos >= 0:
+                before = self._buf[:nl_pos]
+                rest = self._buf[nl_pos:].lstrip('\n')
+                cleaned = self._clean_for_tts(before)
+                if len(cleaned) >= self.MIN_LENGTH:
+                    # Long enough, emit as a sentence
+                    self._buf = rest
+                    results.append(cleaned)
+                    continue
+                elif not rest:
+                    # No more text after newline, keep buffer and wait
+                    break
+                else:
+                    # Too short — merge with next paragraph. Put a space where
+                    # the newline joined two ASCII letters/digits, otherwise
+                    # "Hi\nthere" would reach TTS as "Hithere".
+                    joint = before[-1:] + rest[:1]
+                    ascii_words = len(joint) == 2 and joint.isascii() and joint.isalnum()
+                    self._buf = before + (' ' if ascii_words else '') + rest
+                    continue
+
+            # 1. Try strong punctuation split — take the first punctuation mark
+            #    whose preceding text is long enough; shorter ones are skipped
+            best_end = -1
+            for match in _STRONG_PUNCT_RE.finditer(self._buf):
+                end_pos = match.end()
+                if len(self._buf[:end_pos].strip()) >= self.MIN_LENGTH:
+                    best_end = end_pos
+                    break
+
+            if best_end > 0:
+                sentence = self._clean_for_tts(self._buf[:best_end])
+                self._buf = self._buf[best_end:]
+                if sentence:
+                    results.append(sentence)
+                continue
+
+            # 2. Buffer getting long: try soft punctuation split
+            if buf_len >= self.SOFT_THRESHOLD:
+                best_soft = -1
+                for m in _SOFT_PUNCT_RE.finditer(self._buf):
+                    pos = m.end()
+                    if pos >= self.MIN_LENGTH:
+                        best_soft = pos
+                        if pos >= self.SOFT_THRESHOLD:
+                            break
+                if best_soft >= self.MIN_LENGTH:
+                    sentence = self._clean_for_tts(self._buf[:best_soft])
+                    self._buf = self._buf[best_soft:]
+                    if sentence:
+                        results.append(sentence)
+                    continue
+
+            # 3. Buffer too long: force split at MAX_LENGTH
+            if buf_len >= self.MAX_LENGTH:
+                split_at = self.MAX_LENGTH
+                search_region = self._buf[self.MIN_LENGTH:self.MAX_LENGTH]
+                last_space = max(search_region.rfind(' '), search_region.rfind('，'),
+                                search_region.rfind(','), search_region.rfind('、'))
+                if last_space >= 0:
+                    split_at = self.MIN_LENGTH + last_space + 1
+
+                sentence = self._clean_for_tts(self._buf[:split_at])
+                self._buf = self._buf[split_at:]
+                if sentence:
+                    results.append(sentence)
+                continue
+
+            # Not enough text yet, wait for more
+            break
+
+        return results
+
+    def flush(self) -> list[str]:
+        """Flush remaining buffer. Call at end of stream."""
+        sentence = self._clean_for_tts(self._buf)
+        self._buf = ""
+        return [sentence] if sentence else []
+
 
 class StreamTagFilter:
     """