# qwen_agent/services/voice_utils.py
import json
import re
import logging
from typing import Optional, AsyncGenerator
logger = logging.getLogger('app')
SENTENCE_END_RE = re.compile(r'[。!?;\n.!?;]')

# Emoji pattern: matches Unicode emoji without touching CJK characters
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols extended-A
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # misc symbols
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U0000200D"             # zero width joiner
    "\U000024C2"             # Ⓜ enclosed letter
    "\U00002B50\U00002B55"   # star, circle
    "\U000023CF\U000023E9-\U000023F3\U000023F8-\U000023FA"  # media controls
    "\U0001F170-\U0001F251"  # enclosed alphanumeric supplement
    "]+",
    flags=re.UNICODE,
)

# Strong sentence-ending punctuation (excluding \n which is handled separately)
_STRONG_PUNCT_RE = re.compile(r'[。!?;.!?;~]')
# Soft punctuation (usable as split points when buffer is getting long)
_SOFT_PUNCT_RE = re.compile(r'[,:、)) \t]')


class TTSSentenceSplitter:
    """
    Intelligent sentence splitter for TTS streaming.

    Rules (in priority order):
      1. Split on newlines (LLM paragraph boundaries); a paragraph shorter
         than MIN_LENGTH is merged into the following one.
      2. Split on strong punctuation (。!?~ etc.) once the text before the
         punctuation mark is at least MIN_LENGTH characters.
      3. Once the buffer reaches SOFT_THRESHOLD, also split on soft
         punctuation (、 , : etc.).
      4. Once the buffer reaches MAX_LENGTH, force a split: prefer the last
         space/comma after MIN_LENGTH, otherwise cut at exactly MAX_LENGTH.

    Emitted sentences are cleaned for TTS — emoji stripped (TTS cannot
    pronounce them) and runs of spaces/tabs collapsed.  flush() returns any
    remaining text regardless of length.
    """

    MIN_LENGTH = 10       # Don't send sentences shorter than this
    SOFT_THRESHOLD = 30   # Start considering soft punctuation splits
    MAX_LENGTH = 80       # Force split even without punctuation

    def __init__(self):
        # Text received via feed() but not yet emitted as a sentence.
        self._buf = ""

    def _clean_for_tts(self, text: str) -> str:
        """Remove emoji and collapse runs of spaces/tabs; strip the result."""
        text = _EMOJI_RE.sub("", text)
        text = re.sub(r'[ \t]+', ' ', text)
        return text.strip()

    def feed(self, chunk: str) -> list[str]:
        """Feed a text chunk, return list of ready sentences (may be empty)."""
        self._buf += chunk
        results = []
        while self._buf:
            buf_len = len(self._buf)

            # 0. Newline split — highest priority.
            nl_pos = self._buf.find('\n')
            if nl_pos >= 0:
                before = self._buf[:nl_pos]
                rest = self._buf[nl_pos:].lstrip('\n')
                cleaned = self._clean_for_tts(before)
                if len(cleaned) >= self.MIN_LENGTH:
                    # Long enough, emit as a sentence.
                    self._buf = rest
                    results.append(cleaned)
                    continue
                elif not rest:
                    # No more text after newline, keep buffer and wait.
                    break
                else:
                    # Too short — merge with the next paragraph.
                    self._buf = before + rest
                    continue

            # 1. Strong punctuation: take the first split point whose
            #    prefix is long enough; shorter prefixes keep scanning.
            best_end = -1
            for match in _STRONG_PUNCT_RE.finditer(self._buf):
                end_pos = match.end()
                if len(self._buf[:end_pos].strip()) >= self.MIN_LENGTH:
                    best_end = end_pos
                    break
            if best_end > 0:
                sentence = self._clean_for_tts(self._buf[:best_end])
                self._buf = self._buf[best_end:]
                if sentence:
                    results.append(sentence)
                continue

            # 2. Buffer getting long: split at the last soft punctuation
            #    before SOFT_THRESHOLD (or the first one past MIN_LENGTH).
            if buf_len >= self.SOFT_THRESHOLD:
                best_soft = -1
                for m in _SOFT_PUNCT_RE.finditer(self._buf):
                    pos = m.end()
                    if pos >= self.MIN_LENGTH:
                        best_soft = pos
                        if pos >= self.SOFT_THRESHOLD:
                            break
                if best_soft >= self.MIN_LENGTH:
                    sentence = self._clean_for_tts(self._buf[:best_soft])
                    self._buf = self._buf[best_soft:]
                    if sentence:
                        results.append(sentence)
                    continue

            # 3. Buffer too long: force split, preferring a word boundary.
            if buf_len >= self.MAX_LENGTH:
                split_at = self.MAX_LENGTH
                search_region = self._buf[self.MIN_LENGTH:self.MAX_LENGTH]
                # BUG FIX: this previously called rfind('') with empty
                # needles (fullwidth characters lost in transit), which
                # always returns len(search_region) — forcing split_at to
                # MAX_LENGTH + 1 and disabling the boundary search.  Search
                # for ASCII and fullwidth space/comma explicitly.
                last_break = max(search_region.rfind(' '), search_region.rfind('　'),
                                 search_region.rfind(','), search_region.rfind('，'))
                if last_break >= 0:
                    split_at = self.MIN_LENGTH + last_break + 1
                sentence = self._clean_for_tts(self._buf[:split_at])
                self._buf = self._buf[split_at:]
                if sentence:
                    results.append(sentence)
                continue

            # Not enough text yet, wait for more.
            break
        return results

    def flush(self) -> list[str]:
        """Flush remaining buffer. Call at end of stream."""
        results = []
        if self._buf.strip():
            sentence = self._clean_for_tts(self._buf)
            if sentence:
                results.append(sentence)
        self._buf = ""
        return results
class StreamTagFilter:
    """
    Stream-level tag filter for TTS.

    Routes only the text inside [ANSWER] blocks to the caller; content
    inside any other known tag block ([TOOL_CALL], [THINK], [SOURCE], ...)
    is discarded.  As a fallback, a stream that never contains a known tag
    is passed through untouched.  Unknown bracketed text (e.g. citations)
    is re-emitted verbatim.
    """

    SKIP_TAGS = {"TOOL_CALL", "TOOL_RESPONSE", "THINK", "SOURCE", "REFERENCE"}
    KNOWN_TAGS = {"ANSWER", "TOOL_CALL", "TOOL_RESPONSE", "THINK", "SOURCE", "REFERENCE", "PREAMBLE", "SUMMARY"}

    def __init__(self):
        self.state = "idle"          # one of: idle, answer, skip
        self.found_any_tag = False   # True once any known tag has appeared
        self._pending = ""           # unconsumed text (may end mid-tag)
        self.answer_ended = False    # an [ANSWER] block closed during last feed()

    def _passthrough(self) -> bool:
        # Emit text while inside an answer, or before any known tag appeared.
        return self.state == "answer" or not self.found_any_tag

    def feed(self, chunk: str) -> str:
        """Feed a chunk, return text that should be passed to TTS."""
        self.answer_ended = False
        self._pending += chunk
        emitted = []
        while self._pending:
            open_idx = self._pending.find("[")
            if self.state == "skip":
                if open_idx == -1:
                    # Everything here belongs to a skipped block; drop it.
                    self._pending = ""
                    continue
                close_idx = self._pending.find("]", open_idx)
                if close_idx == -1:
                    # Tag may be split across chunks; hold from "[" onward.
                    self._pending = self._pending[open_idx:]
                    break
                tag = self._pending[open_idx + 1:close_idx]
                self._pending = self._pending[close_idx + 1:]
                if tag == "ANSWER":
                    self.state = "answer"
                elif tag in self.KNOWN_TAGS:
                    self.state = "skip"
                # Unknown brackets inside a skipped block are dropped.
            else:  # idle or answer
                if open_idx == -1:
                    if self._passthrough():
                        emitted.append(self._pending)
                    self._pending = ""
                    continue
                head = self._pending[:open_idx]
                if head and self._passthrough():
                    emitted.append(head)
                close_idx = self._pending.find("]", open_idx)
                if close_idx == -1:
                    self._pending = self._pending[open_idx:]
                    break
                tag = self._pending[open_idx + 1:close_idx]
                self._pending = self._pending[close_idx + 1:]
                if tag not in self.KNOWN_TAGS:
                    # Not a control tag — re-emit it verbatim.
                    if self._passthrough():
                        emitted.append(f"[{tag}]")
                    continue
                self.found_any_tag = True
                if tag == "ANSWER":
                    self.state = "answer"
                else:
                    if self.state == "answer":
                        self.answer_ended = True
                    self.state = "skip"
        return "".join(emitted)
def clean_markdown(text: str) -> str:
    """Strip Markdown formatting characters for TTS readability."""
    # (pattern, replacement, flags) applied in order.  Order matters:
    # images must collapse before plain links so ![a](u) is not
    # half-matched by the link rule.
    rules = (
        (r'!\[([^\]]*)\]\([^)]*\)', r'\1', 0),          # images -> alt text
        (r'\[([^\]]*)\]\([^)]*\)', r'\1', 0),           # links -> link text
        (r'#{1,6}\s*', '', 0),                          # heading markers
        (r'\*{1,3}|_{1,3}|~~|`{1,3}', '', 0),           # emphasis / code marks
        (r'^>\s*', '', re.MULTILINE),                   # blockquote markers
        (r'^\s*[-*+]\s+', '', re.MULTILINE),            # bullet list markers
        (r'^\s*\d+\.\s+', '', re.MULTILINE),            # ordered list markers
        (r'^[\s]*[-*_]{3,}[\s]*$', '', re.MULTILINE),   # horizontal rules
        (r'\n{2,}', '\n', 0),                           # collapse blank lines
    )
    for pattern, repl, flags in rules:
        text = re.sub(pattern, repl, text, flags=flags)
    return text.strip()
async def stream_v3_agent(
    user_text: str,
    bot_id: str,
    bot_config: dict,
    session_id: str,
    user_identifier: str,
) -> AsyncGenerator[str, None]:
    """Call v3 agent API in streaming mode, yield text chunks as they arrive.

    Args:
        user_text: The user utterance, sent as a single user message.
        bot_id: Identifier of the bot whose config is used.
        bot_config: Bot configuration dict; this function reads
            "language" (defaults to "zh"), "dataset_ids" and "skills".
        session_id: Conversation session identifier forwarded to the agent.
        user_identifier: Caller identity forwarded to the agent.

    Yields:
        Incremental text extracted from the agent's SSE stream — the
        ``choices[0].delta.content`` field of each ``data: {...}`` line.

    Notes:
        On any error other than cancellation, the exception is logged and
        the generator simply ends: callers see a truncated stream rather
        than an exception (deliberate best-effort behavior).
    """
    import asyncio
    try:
        # Project-local imports are deferred — presumably to avoid circular
        # imports with the routes layer; confirm before hoisting to top level.
        from utils.api_models import ChatRequestV3, Message
        from utils.fastapi_utils import (
            process_messages,
            create_project_directory,
        )
        from agent.agent_config import AgentConfig
        from routes.chat import enhanced_generate_stream_response

        language = bot_config.get("language", "zh")
        messages_obj = [Message(role="user", content=user_text)]
        request = ChatRequestV3(
            messages=messages_obj,
            bot_id=bot_id,
            stream=True,
            session_id=session_id,
            user_identifier=user_identifier,
        )
        # Working directory prepared from the bot's datasets and skills.
        project_dir = create_project_directory(
            bot_config.get("dataset_ids", []),
            bot_id,
            bot_config.get("skills", []),
        )
        processed_messages = process_messages(messages_obj, language)
        config = await AgentConfig.from_v3_request(
            request,
            bot_config,
            project_dir,
            processed_messages,
            language,
        )
        config.stream = True
        # The agent emits Server-Sent Events ("data: ..." lines, ending
        # with "data: [DONE]"); forward only the delta text to the caller.
        async for sse_line in enhanced_generate_stream_response(config):
            if not sse_line or not sse_line.startswith("data: "):
                continue
            data_str = sse_line.strip().removeprefix("data: ")
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
                choices = data.get("choices", [])
                if choices:
                    delta = choices[0].get("delta", {})
                    content = delta.get("content", "")
                    if content:
                        yield content
            except (json.JSONDecodeError, KeyError):
                # Skip malformed SSE payloads instead of aborting the stream.
                continue
    except asyncio.CancelledError:
        # Propagate cancellation so the caller's task machinery works.
        logger.info(f"[Voice] v3 agent call cancelled")
        raise
    except Exception as e:
        # Best-effort: log and end the stream silently (see docstring note).
        logger.error(f"[Voice] Error calling v3 agent: {e}", exc_info=True)