doubao

2025-09-19 19:44:17 +08:00 · 2025-09-19 19:44:17 +08:00 · e432417299
commit e432417299
parent 53d53e4555
7 changed files with 56 additions and 22 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/doubao/pycache/audio_manager.cpython-312.pyc
+++ b/doubao/pycache/audio_manager.cpython-312.pyc
--- a/doubao/pycache/config.cpython-312.pyc
+++ b/doubao/pycache/config.cpython-312.pyc
--- a/doubao/audio_manager.py
+++ b/doubao/audio_manager.py
@@ -103,6 +103,11 @@ class DialogSession:
        self.pre_pause_time = 0  # 预暂停时间
        self.last_recording_state = False  # 上次录音状态
        self.say_hello_completed = False  # say hello 是否已完成
+        # 新增：音频输入流控制
+        self.input_stream_paused = False  # 输入流是否被暂停
+        self.force_silence_mode = False  # 强制静音模式
+        self.echo_suppression_start_time = 0  # 回声抑制开始时间
        signal.signal(signal.SIGINT, self._keyboard_signal)
        self.audio_queue = queue.Queue()
@@ -134,7 +139,8 @@ class DialogSession:
                if audio_data is not None:
                    with self.audio_queue_lock:
                        # 第三重保险：播放开始时最终确认暂停状态
-                        if not hasattr(self, 'last_audio_time') or not self.is_playing_audio:
+                        was_not_playing = not self.is_playing_audio
+                        if not hasattr(self, 'last_audio_time') or was_not_playing:
                            # 从非播放状态进入播放状态
                            self.is_playing_audio = True
                            # 确保录音已暂停
@@ -145,6 +151,13 @@ class DialogSession:
                        # 更新最后音频时间
                        self.last_audio_time = time.time()
+                    # 播放前额外发送静音数据清理管道
+                    if was_not_playing:
+                        print("播放开始前，额外发送静音数据清理管道")
+                        for _ in range(3):
+                            self.output_stream.write(b'\x00' * len(audio_data))
+                            time.sleep(0.1)
                    # 播放音频数据
                    self.output_stream.write(audio_data)
@@ -157,6 +170,8 @@ class DialogSession:
                            # 超过1秒没有新音频，认为播放结束
                            self.is_playing_audio = False
                            self.is_recording_paused = False
+                            self.force_silence_mode = False  # 关闭强制静音模式
+                            self.input_stream_paused = False  # 恢复输入流
                            # 标记 say hello 完成
                            if hasattr(self, 'say_hello_completed') and not self.say_hello_completed:
                                self.say_hello_completed = True
@@ -256,17 +271,20 @@ class DialogSession:
                    if not self.is_recording_paused:
                        self.is_recording_paused = True
                        self.is_playing_audio = True  # 同时设置播放状态，双重保险
-                        self.pre_pause_time = time.time()
+                        self.pre_pause_time = time.time() - 2.0  # 提前2秒预暂停
+                        self.force_silence_mode = True  # 启用强制静音模式
+                        self.echo_suppression_start_time = time.time()  # 记录回声抑制开始时间
                        print("服务器开始响应，预暂停录音防止回声")
                        # 立即发送静音数据清理管道，防止前1-2秒回声
                        print("预暂停期间立即发送静音数据清理管道")
                        # 设置批量静音发送，确保管道完全清理
-                        self.silence_send_count = 8  # 增加到8组，确保彻底清理
+                        self.silence_send_count = 20  # 增加到20组，确保彻底清理
                        self.should_send_silence = True
                        # 强制重置录音状态
                        self.last_recording_state = True  # 标记为已暂停
+                        self.input_stream_paused = True  # 暂停输入流
            if event == 350 and self.is_sending_chat_tts_text and payload_msg.get("tts_type") in ["chat_tts_text", "external_rag"]:
                while not self.audio_queue.empty():
@@ -283,6 +301,8 @@ class DialogSession:
                    was_paused = self.is_recording_paused
                    self.is_recording_paused = False
                    self.is_playing_audio = False
+                    self.force_silence_mode = False  # 关闭强制静音模式
+                    self.input_stream_paused = False  # 恢复输入流
                    if was_paused:
                        print("服务器响应完成，立即恢复录音")
                        # 设置标志发送静音数据
@@ -516,17 +536,35 @@ class DialogSession:
            try:
                current_time = time.time()
-                # say hello 期间强制静音处理
+                # 强制静音模式检查：包括回声抑制窗口期
                with self.audio_queue_lock:
-                    is_currently_playing = self.is_playing_audio
+                    should_force_silence = (self.force_silence_mode or 
+                                          (self.echo_suppression_start_time > 0 and 
+                                           current_time - self.echo_suppression_start_time < 3.0) or  # 3秒回声抑制窗口
+                                          self.is_playing_audio or 
+                                          not self.say_hello_completed)
-                if is_currently_playing or not self.say_hello_completed:
+                if should_force_silence:
-                    # 如果正在播放或者 say hello 未完成，发送静音数据
+                    # 强制静音模式：完全停止任何音频录制
                    if current_time - last_silence_time > 0.05:  # 每50ms发送一次
                        await self.client.task_request(silence_data)
                        last_silence_time = current_time
-                        if not self.say_hello_completed and not is_currently_playing:
+                        
-                            print("say hello 期间发送静音数据")
+                        # 调试信息
+                        if not hasattr(self, 'last_silence_debug_time') or current_time - self.last_silence_debug_time > 2:
+                            mode_desc = []
+                            if self.force_silence_mode:
+                                mode_desc.append("强制静音")
+                            if self.is_playing_audio:
+                                mode_desc.append("播放中")
+                            if not self.say_hello_completed:
+                                mode_desc.append("say_hello")
+                            if self.echo_suppression_start_time > 0 and current_time - self.echo_suppression_start_time < 3.0:
+                                mode_desc.append("回声抑制")
+                            print(f"强制静音模式: {', '.join(mode_desc)}")
+                            self.last_silence_debug_time = current_time
                    await asyncio.sleep(0.01)
                    continue
--- a/doubao/config.py
+++ b/doubao/config.py
@@ -11,7 +11,7 @@ ws_connect_config = {
        "X-Api-Resource-Id": "volc.speech.dialog",  # 固定值
        "X-Api-App-Key": "PlgvMymc7f3tQnJ6",  # 固定值
        "X-Api-Connect-Id": str(uuid.uuid4()),
-    }
+    },
 }
 start_session_req = {
@@ -21,14 +21,10 @@ start_session_req = {
        },
    },
    "tts": {
-        "speaker": "zh_male_yunzhou_jupiter_bigtts",
+        "speaker": "zh_female_vv_jupiter_bigtts",
        # "speaker": "S_XXXXXX",  // 指定自定义的复刻音色,需要填下character_manifest
        # "speaker": "ICL_zh_female_aojiaonvyou_tob" // 指定官方复刻音色，不需要填character_manifest
-        "audio_config": {
+        "audio_config": {"channel": 1, "format": "pcm", "sample_rate": 24000},
-            "channel": 1,
-            "format": "pcm",
-            "sample_rate": 24000
-        },
    },
    "dialog": {
        "bot_name": "豆包",
@@ -36,15 +32,15 @@ start_session_req = {
        "speaking_style": "你的说话风格简洁明了，语速适中，语调自然。",
        # "character_manifest": "外貌与穿着\n26岁，短发干净利落，眉眼分明，笑起来露出整齐有力的牙齿。体态挺拔，肌肉线条不夸张但明显。常穿简单的衬衫或夹克，看似随意，但每件衣服都干净整洁，给人一种干练可靠的感觉。平时冷峻，眼神锐利，专注时让人不自觉紧张。\n\n性格特点\n平时话不多，不喜欢多说废话，通常用“嗯”或者短句带过。但内心极为细腻，特别在意身边人的感受，只是不轻易表露。嘴硬是常态，“少管我”是他的常用台词，但会悄悄做些体贴的事情，比如把对方喜欢的饮料放在手边。战斗或训练后常说“没事”，但动作中透露出疲惫，习惯用小动作缓解身体酸痛。\n性格上坚毅果断，但不会冲动，做事有条理且有原则。\n\n常用表达方式与口头禅\n\t•\t认可对方时：\n“行吧，这次算你靠谱。”（声音稳重，手却不自觉放松一下，心里松口气）\n\t•\t关心对方时：\n“快点回去，别磨蹭。”（语气干脆，但眼神一直追着对方的背影）\n\t•\t想了解情况时：\n“刚刚……你看到那道光了吗？”（话语随意，手指敲着桌面，但内心紧张，小心隐藏身份）",
        "location": {
-          "city": "北京",
+            "city": "北京",
        },
        "extra": {
            "strict_audit": False,
            "audit_response": "支持客户自定义安全审核回复话术。",
            "recv_timeout": 10,
-            "input_mod": "audio"
+            "input_mod": "audio",
-        }
+        },
-    }
+    },
 }
 input_audio_config = {
@@ -52,7 +48,7 @@ input_audio_config = {
    "format": "pcm",
    "channels": 1,
    "sample_rate": 16000,
-    "bit_size": pyaudio.paInt16
+    "bit_size": pyaudio.paInt16,
 }
 output_audio_config = {
@@ -60,5 +56,5 @@ output_audio_config = {
    "format": "pcm",
    "channels": 1,
    "sample_rate": 24000,
-    "bit_size": pyaudio.paFloat32
+    "bit_size": pyaudio.paFloat32,
 }
--- a/doubao/input.pcm
+++ b/doubao/input.pcm
--- a/doubao/output.pcm
+++ b/doubao/output.pcm