diff --git a/services/voice_session_manager.py b/services/voice_session_manager.py index 7c8ca2b..437f46a 100644 --- a/services/voice_session_manager.py +++ b/services/voice_session_manager.py @@ -19,6 +19,7 @@ class _StreamTagFilter: """ SKIP_TAGS = {"TOOL_CALL", "TOOL_RESPONSE", "THINK", "SOURCE", "REFERENCE"} + KNOWN_TAGS = {"ANSWER", "TOOL_CALL", "TOOL_RESPONSE", "THINK", "SOURCE", "REFERENCE", "PREAMBLE", "SUMMARY"} def __init__(self): self.state = "idle" # idle, answer, skip @@ -50,8 +51,13 @@ class _StreamTagFilter: tag_name = self._pending[bracket_pos + 1:close_pos] self._pending = self._pending[close_pos + 1:] - self.found_any_tag = True + if tag_name not in self.KNOWN_TAGS: + if self.state == "answer" or not self.found_any_tag: + output.append(f"[{tag_name}]") + continue + + self.found_any_tag = True if tag_name == "ANSWER": self.state = "answer" else: @@ -70,6 +76,9 @@ class _StreamTagFilter: tag_name = self._pending[bracket_pos + 1:close_pos] self._pending = self._pending[close_pos + 1:] + if tag_name not in self.KNOWN_TAGS: + continue + if tag_name == "ANSWER": self.state = "answer" else: @@ -115,6 +124,11 @@ class VoiceSession: self._current_asr_text = "" # When True, discard TTS audio from SERVER_ACK (comfort speech period) self._is_sending_chat_tts_text = False + # Set to True when event 350 fires for chat_tts_text, indicating the TTS segment is done + # and next TTS send must use start=True to begin a new session + self._tts_segment_done = False + # Signaled when event 359 fires (TTS fully completed), used to wait before starting new TTS + self._tts_complete_event: asyncio.Event = asyncio.Event() self._receive_task: Optional[asyncio.Task] = None self._agent_task: Optional[asyncio.Task] = None @@ -252,10 +266,14 @@ class VoiceSession: self._is_sending_chat_tts_text = False logger.info(f"[Voice] Comfort/RAG TTS done, resuming audio forwarding") + # Mark TTS segment as done so next send uses start=True + if tts_type == "chat_tts_text": + self._tts_segment_done = True + elif event == 359: # TTS fully completed (all segments done) logger.info(f"[Voice] TTS fully completed") - # await self._emit_status("idle") + self._tts_complete_event.set() elif event in (152, 153): logger.info(f"[Voice] Session finished event: {event}") @@ -331,26 +349,59 @@ class VoiceSession: if sentence: sentence = self._clean_markdown(sentence) if sentence: + # If previous TTS segment completed (e.g. gap during tool call), + # close old session, wait for TTS delivery to finish, then restart + if tts_started and self._tts_segment_done: + logger.info(f"[Voice] TTS segment done, closing session and waiting for delivery") + await self.realtime_client.chat_tts_text(content="", start=False, end=True) + self._tts_complete_event.clear() + try: + await asyncio.wait_for(self._tts_complete_event.wait(), timeout=10) + except asyncio.TimeoutError: + logger.warning(f"[Voice] Timeout waiting for TTS complete, proceeding anyway") + tts_started = False + self._tts_segment_done = False + logger.info(f"[Voice] TTS delivery done, starting new session") + logger.info(f"[Voice] Sending TTS sentence: '{sentence[:80]}'") await self.realtime_client.chat_tts_text( content=sentence, start=not tts_started, end=False, ) + if not tts_started: + await self._emit_status("speaking") tts_started = True + self._tts_segment_done = False # Handle remaining text in buffer (last sentence without ending punctuation) remaining = sentence_buf.strip() if remaining: remaining = self._clean_markdown(remaining) if remaining: + # If previous TTS segment completed, close and wait before restart + if tts_started and self._tts_segment_done: + logger.info(f"[Voice] TTS segment done, closing session and waiting for delivery (remaining)") + await self.realtime_client.chat_tts_text(content="", start=False, end=True) + self._tts_complete_event.clear() + try: + await asyncio.wait_for(self._tts_complete_event.wait(), timeout=10) + except asyncio.TimeoutError: + logger.warning(f"[Voice] Timeout waiting for TTS complete, proceeding anyway") + tts_started = False + self._tts_segment_done = False + logger.info(f"[Voice] TTS delivery done, starting new session for remaining") + logger.info(f"[Voice] Sending TTS remaining: '{remaining[:80]}'") await self.realtime_client.chat_tts_text( content=remaining, start=not tts_started, end=False, ) + if not tts_started: + await self._emit_status("speaking") tts_started = True + self._tts_segment_done = False # Send TTS end signal if tts_started: