diff --git a/agent/agent_config.py b/agent/agent_config.py index 264198b..1fe0f8c 100644 --- a/agent/agent_config.py +++ b/agent/agent_config.py @@ -83,7 +83,7 @@ class AgentConfig: safe_dict = self.to_dict().copy() if 'api_key' in safe_dict and isinstance(safe_dict['api_key'], str) and safe_dict['api_key'].startswith('sk-'): safe_dict['api_key'] = safe_dict['api_key'][:8] + '***' + safe_dict['api_key'][-6:] - logger.info(f"config={json.dumps(safe_dict, ensure_ascii=False)}") + logger.info(f"config={json.dumps(safe_dict, ensure_ascii=False, default=str)}") @classmethod async def from_v1_request(cls, request, api_key: str, project_dir: Optional[str] = None, generate_cfg: Optional[Dict] = None, messages: Optional[List] = None): diff --git a/agent/deep_assistant.py b/agent/deep_assistant.py index df64c5a..4a820ee 100644 --- a/agent/deep_assistant.py +++ b/agent/deep_assistant.py @@ -288,9 +288,9 @@ async def init_agent(config: AgentConfig): checkpointer=checkpointer, shell_env={ k: v for k, v in { - "ASSISTANT_ID": config.bot_id, - "USER_IDENTIFIER": config.user_identifier, - "TRACE_ID": config.trace_id, + "ASSISTANT_ID": str(config.bot_id), + "USER_IDENTIFIER": str(config.user_identifier) if config.user_identifier else None, + "TRACE_ID": str(config.trace_id) if config.trace_id else None, **(config.shell_env or {}), }.items() if v is not None } diff --git a/routes/voice.py b/routes/voice.py index 851a886..499f14d 100644 --- a/routes/voice.py +++ b/routes/voice.py @@ -93,11 +93,13 @@ async def voice_realtime(websocket: WebSocket): continue voice_mode = msg.get("voice_mode") or VOICE_DEFAULT_MODE + client_sample_rate = msg.get("sample_rate", 24000) session_kwargs = dict( bot_id=bot_id, session_id=msg.get("session_id"), user_identifier=msg.get("user_identifier"), + sample_rate=client_sample_rate, on_audio=on_audio, on_asr_text=on_asr_text, on_agent_result=on_agent_result, diff --git a/services/streaming_tts_client.py b/services/streaming_tts_client.py index 
async def synthesize(self, text: str):
    """
    Synthesize text to audio via SSE streaming.

    Delegates to ``_synthesize_internal`` with ``raw_int16=False`` and
    yields 24kHz float32 PCM audio chunks.

    The inner stream is closed explicitly when this generator finishes or
    is closed early by the consumer, so its cleanup does not wait for
    garbage collection.
    """
    stream = self._synthesize_internal(text, raw_int16=False)
    try:
        async for chunk in stream:
            yield chunk
    finally:
        # aclose() is a no-op if the inner generator is already exhausted.
        await stream.aclose()


async def synthesize_raw(self, text: str):
    """
    Synthesize text to audio via SSE streaming.

    Delegates to ``_synthesize_internal`` with ``raw_int16=True`` and
    yields 24kHz int16 PCM audio chunks (no float32 conversion).

    The inner stream is closed explicitly when this generator finishes or
    is closed early by the consumer, so its cleanup does not wait for
    garbage collection.
    """
    stream = self._synthesize_internal(text, raw_int16=True)
    try:
        async for chunk in stream:
            yield chunk
    finally:
        # aclose() is a no-op if the inner generator is already exhausted.
        await stream.aclose()
@staticmethod
def _resample_16k_to_24k(pcm_data: bytes) -> bytes:
    """Upsample little-endian 16-bit PCM from 16kHz to 24kHz (ratio 2:3).

    For every 2 input samples, produces 3 output samples using linear
    interpolation. Output sample ``i`` sits at source position ``2*i/3``;
    the interpolation is done in exact integer arithmetic and rounded to
    the nearest integer, so positive and negative samples are treated
    symmetrically.

    Args:
        pcm_data: Raw int16 samples. A trailing odd byte is ignored.

    Returns:
        ``(n_samples * 3) // 2`` int16 samples packed as bytes, or ``b''``
        when the input holds no complete sample. The final input sample is
        held where no following sample exists to interpolate towards.

    NOTE(review): each call is stateless, so nothing is interpolated across
    chunk boundaries - confirm this is acceptable for streamed audio.
    """
    n_samples = len(pcm_data) // 2
    if n_samples == 0:
        return b''
    samples = struct.unpack(f'<{n_samples}h', pcm_data[:n_samples * 2])
    out_len = (n_samples * 3) // 2
    last_idx = n_samples - 1
    resampled = []
    for i in range(out_len):
        # Source position is src_idx + rem/3; src_idx never exceeds last_idx
        # because 2*i <= 3*n_samples - 2 for every i < out_len.
        src_idx, rem = divmod(2 * i, 3)
        if rem == 0 or src_idx == last_idx:
            resampled.append(samples[src_idx])
            continue
        weighted = samples[src_idx] * (3 - rem) + samples[src_idx + 1] * rem
        # Round weighted/3 to nearest (thirds never land exactly on .5).
        # The result lies between the two neighbours, so no clamp is needed.
        resampled.append((2 * weighted + 3) // 6)
    return struct.pack(f'<{len(resampled)}h', *resampled)


def _resample_input(self, audio_data: bytes) -> bytes:
    """Resample incoming audio to 24kHz if needed.

    Only a client sample rate of exactly 16000 is converted; audio for any
    other configured rate is returned unchanged.
    """
    if self._client_sample_rate == 16000:
        return self._resample_16k_to_24k(audio_data)
    return audio_data


def _resample_output(self, audio_data: bytes) -> bytes:
    """Resample outgoing audio from 24kHz to client sample rate if needed.

    Only a client sample rate of exactly 16000 is converted; audio for any
    other configured rate is returned unchanged.
    """
    if self._client_sample_rate == 16000:
        return self._resample_24k_to_16k(audio_data)
    return audio_data
async def _send_tts(self, tts_client: StreamingTTSClient, sentence: str) -> None:
    """Synthesize one sentence and pass each audio chunk to ``self._on_audio``.

    When the client sample rate is not 24000, the raw int16 stream
    (``synthesize_raw``) is used and every chunk goes through
    ``_resample_output`` before delivery. Otherwise the chunks from
    ``synthesize`` are delivered untouched. Chunks are consumed but dropped
    while no ``_on_audio`` callback is set.
    """
    logger.info(f"[VoiceLite] TTS sentence: '{sentence[:80]}'")

    # Decide once per sentence which TTS pipeline feeds the callback.
    needs_resample = self._client_sample_rate != 24000
    if needs_resample:
        chunks = tts_client.synthesize_raw(sentence)
    else:
        chunks = tts_client.synthesize(sentence)

    async for pcm in chunks:
        if not self._on_audio:
            continue
        payload = self._resample_output(pcm) if needs_resample else pcm
        await self._on_audio(payload)
+16,7 @@ import urllib.parse def get_config(): """获取配置,下面的MASTERKEY和ASSISTANT_ID是从环境变量自动获取的,不需要用户提供""" masterkey = os.environ.get("MASTERKEY", "master") - bot_id = os.environ.get("ASSISTANT_ID", "") + bot_id = str(os.environ.get("ASSISTANT_ID", "")) if not masterkey: print("ERROR: MASTERKEY environment variable is required") sys.exit(1) diff --git a/utils/settings.py b/utils/settings.py index ee51a27..0d9bae8 100644 --- a/utils/settings.py +++ b/utils/settings.py @@ -110,7 +110,7 @@ RAGFLOW_MAX_CONCURRENT_UPLOADS = int(os.getenv("RAGFLOW_MAX_CONCURRENT_UPLOADS", # ============================================================ # New API 基础 URL(支付后端) -NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://116.62.16.218:3000") +NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://100.77.70.35:3001") # New API 请求超时(秒) NEW_API_TIMEOUT = int(os.getenv("NEW_API_TIMEOUT", "30")) @@ -133,7 +133,7 @@ VOLCENGINE_TTS_SAMPLE_RATE = int(os.getenv("VOLCENGINE_TTS_SAMPLE_RATE", "24000" # ============================================================ VOICE_DEFAULT_MODE = os.getenv("VOICE_DEFAULT_MODE", "lite") # "realtime" | "lite" # Silence timeout (seconds) - ASR considers user done speaking after this -VOICE_LITE_SILENCE_TIMEOUT = float(os.getenv("VOICE_LITE_SILENCE_TIMEOUT", "1.5")) +VOICE_LITE_SILENCE_TIMEOUT = float(os.getenv("VOICE_LITE_SILENCE_TIMEOUT", "3.0")) # ============================================================ # Single Agent Mode Configuration