This commit is contained in:
朱潮 2025-09-18 23:34:55 +08:00
parent e6aa7f7be8
commit 53d53e4555
10 changed files with 311 additions and 31 deletions

BIN
.DS_Store vendored

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -8,11 +8,10 @@ import time
import uuid import uuid
import wave import wave
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Dict, Any from typing import Any, Dict, Optional
import pyaudio
import config import config
import pyaudio
from realtime_dialog_client import RealtimeDialogClient from realtime_dialog_client import RealtimeDialogClient
@ -96,6 +95,14 @@ class DialogSession:
self.is_user_querying = False self.is_user_querying = False
self.is_sending_chat_tts_text = False self.is_sending_chat_tts_text = False
self.audio_buffer = b'' self.audio_buffer = b''
self.is_playing_audio = False # 是否正在播放音频
self.audio_queue_lock = threading.Lock() # 音频队列锁
self.is_recording_paused = False # 录音是否被暂停
self.should_send_silence = False # 是否需要发送静音数据
self.silence_send_count = 0 # 需要发送的静音数据数量
self.pre_pause_time = 0 # 预暂停时间
self.last_recording_state = False # 上次录音状态
self.say_hello_completed = False # say hello 是否已完成
signal.signal(signal.SIGINT, self._keyboard_signal) signal.signal(signal.SIGINT, self._keyboard_signal)
self.audio_queue = queue.Queue() self.audio_queue = queue.Queue()
@ -105,7 +112,9 @@ class DialogSession:
AudioConfig(**config.output_audio_config) AudioConfig(**config.output_audio_config)
) )
# 初始化音频队列和输出流 # 初始化音频队列和输出流
print(f"输出音频配置: {config.output_audio_config}")
self.output_stream = self.audio_device.open_output_stream() self.output_stream = self.audio_device.open_output_stream()
print("音频输出流已打开")
# 启动播放线程 # 启动播放线程
self.is_recording = True self.is_recording = True
self.is_playing = True self.is_playing = True
@ -115,44 +124,149 @@ class DialogSession:
def _audio_player_thread(self): def _audio_player_thread(self):
"""音频播放线程""" """音频播放线程"""
audio_playing_timeout = 1.0 # 1秒没有音频数据认为播放结束
queue_check_interval = 0.1 # 每100ms检查一次队列状态
while self.is_playing: while self.is_playing:
try: try:
# 从队列获取音频数据 # 从队列获取音频数据
audio_data = self.audio_queue.get(timeout=1.0) audio_data = self.audio_queue.get(timeout=queue_check_interval)
if audio_data is not None: if audio_data is not None:
with self.audio_queue_lock:
# 第三重保险:播放开始时最终确认暂停状态
if not hasattr(self, 'last_audio_time') or not self.is_playing_audio:
# 从非播放状态进入播放状态
self.is_playing_audio = True
# 确保录音已暂停
if not self.is_recording_paused:
self.is_recording_paused = True
print("播放开始,最终确认暂停录音")
# 更新最后音频时间
self.last_audio_time = time.time()
# 播放音频数据
self.output_stream.write(audio_data) self.output_stream.write(audio_data)
except queue.Empty: except queue.Empty:
# 队列为空时等待一小段时间 # 队列为空,检查是否超时
time.sleep(0.1) current_time = time.time()
with self.audio_queue_lock:
if self.is_playing_audio:
if hasattr(self, 'last_audio_time') and current_time - self.last_audio_time > audio_playing_timeout:
# 超过1秒没有新音频认为播放结束
self.is_playing_audio = False
self.is_recording_paused = False
# 标记 say hello 完成
if hasattr(self, 'say_hello_completed') and not self.say_hello_completed:
self.say_hello_completed = True
print("say hello 音频播放完成")
print("音频播放超时,恢复录音")
# 直接发送静音数据,而不是在协程中发送
try:
silence_data = b'\x00' * config.input_audio_config["chunk"]
# 使用同步方式发送静音数据
# 这里我们设置一个标志,让主循环处理
self.silence_send_count = 2 # 播放超时时发送2组静音数据
self.should_send_silence = True
except Exception as e:
print(f"准备静音数据失败: {e}")
elif self.audio_queue.empty():
# 队列为空,但还没超时,继续等待
pass
time.sleep(0.01)
except Exception as e: except Exception as e:
print(f"音频播放错误: {e}") print(f"音频播放错误: {e}")
with self.audio_queue_lock:
self.is_playing_audio = False
self.is_recording_paused = False
time.sleep(0.1) time.sleep(0.1)
# 移除了静音检测函数,避免干扰正常的音频处理
async def _send_silence_on_playback_end(self):
    """Send one chunk of zero bytes to the server once playback has ended.

    Any failure (missing config key, client error) is reported on stdout
    and swallowed, so callers never see an exception from this coroutine.
    """
    try:
        chunk_size = config.input_audio_config["chunk"]
        # NOTE(review): the payload is `chunk` zero *bytes*; if `chunk`
        # counts frames rather than bytes this is shorter than one
        # recorded chunk — confirm against the input stream format.
        zero_chunk = b'\x00' * chunk_size
        await self.client.task_request(zero_chunk)
    except Exception as err:
        print(f"发送静音数据失败: {err}")
    else:
        print("播放结束,已发送静音数据")
def _check_and_restore_recording(self):
"""检查并恢复录音状态"""
with self.audio_queue_lock:
if self.is_recording_paused and self.audio_queue.empty():
# 如果队列为空且录音被暂停,恢复录音
self.is_recording_paused = False
self.is_playing_audio = False
print("音频队列为空,自动恢复录音")
return True
return False
def handle_server_response(self, response: Dict[str, Any]) -> None: def handle_server_response(self, response: Dict[str, Any]) -> None:
if response == {}: if not response or response == {}:
return return
"""处理服务器响应""" """处理服务器响应"""
if response['message_type'] == 'SERVER_ACK' and isinstance(response.get('payload_msg'), bytes): message_type = response.get('message_type')
# print(f"\n接收到音频数据: {len(response['payload_msg'])} 字节") if message_type == 'SERVER_ACK' and isinstance(response.get('payload_msg'), bytes):
if self.is_sending_chat_tts_text: if self.is_sending_chat_tts_text:
return return
audio_data = response['payload_msg'] audio_data = response['payload_msg']
# 第二重保险:接收到音频数据时确认暂停状态
with self.audio_queue_lock:
was_not_playing = not self.is_playing_audio
if was_not_playing:
# 第一批音频数据到达,确保录音已暂停
self.is_playing_audio = True
if not self.is_recording_paused:
self.is_recording_paused = True
print("接收到首批音频数据,立即暂停录音")
else:
print("接收到音频数据,录音已暂停")
# 立即发送静音数据,确保管道清理
self.silence_send_count = 3 # 音频数据到达时发送3组静音数据
self.should_send_silence = True
print("服务器收到音频数据,立即清理录音管道")
if not self.is_audio_file_input: if not self.is_audio_file_input:
self.audio_queue.put(audio_data) self.audio_queue.put(audio_data)
self.audio_buffer += audio_data self.audio_buffer += audio_data
elif response['message_type'] == 'SERVER_FULL_RESPONSE': elif message_type == 'SERVER_FULL_RESPONSE':
print(f"服务器响应: {response}") print(f"服务器响应: {response}")
event = response.get('event') event = response.get('event')
payload_msg = response.get('payload_msg', {}) payload_msg = response.get('payload_msg', {})
if event == 450: # 第一重保险:服务器开始响应时立即预暂停录音
print(f"清空缓存音频: {response['session_id']}") if event in [450, 359, 152, 153]: # 这些事件表示服务器开始或结束响应
while not self.audio_queue.empty(): if event == 450:
try: print(f"清空缓存音频: {response['session_id']}")
self.audio_queue.get_nowait() while not self.audio_queue.empty():
except queue.Empty: try:
continue self.audio_queue.get_nowait()
self.is_user_querying = True except queue.Empty:
continue
self.is_user_querying = True
print("服务器准备接收用户输入")
# 预暂停录音,防止即将到来的音频回声
with self.audio_queue_lock:
if not self.is_recording_paused:
self.is_recording_paused = True
self.is_playing_audio = True # 同时设置播放状态,双重保险
self.pre_pause_time = time.time()
print("服务器开始响应,预暂停录音防止回声")
# 立即发送静音数据清理管道防止前1-2秒回声
print("预暂停期间立即发送静音数据清理管道")
# 设置批量静音发送,确保管道完全清理
self.silence_send_count = 8 # 增加到8组确保彻底清理
self.should_send_silence = True
# 强制重置录音状态
self.last_recording_state = True # 标记为已暂停
if event == 350 and self.is_sending_chat_tts_text and payload_msg.get("tts_type") in ["chat_tts_text", "external_rag"]: if event == 350 and self.is_sending_chat_tts_text and payload_msg.get("tts_type") in ["chat_tts_text", "external_rag"]:
while not self.audio_queue.empty(): while not self.audio_queue.empty():
@ -164,11 +278,22 @@ class DialogSession:
if event == 459: if event == 459:
self.is_user_querying = False self.is_user_querying = False
if random.randint(0, 100000)%1 == 0: # 服务器完成响应,立即恢复录音
self.is_sending_chat_tts_text = True with self.audio_queue_lock:
asyncio.create_task(self.trigger_chat_tts_text()) was_paused = self.is_recording_paused
asyncio.create_task(self.trigger_chat_rag_text()) self.is_recording_paused = False
elif response['message_type'] == 'SERVER_ERROR': self.is_playing_audio = False
if was_paused:
print("服务器响应完成,立即恢复录音")
# 设置标志发送静音数据
self.silence_send_count = 2 # 响应完成时发送2组静音数据
self.should_send_silence = True
print("服务器完成响应,等待用户输入")
#if random.randint(0, 100000)%1 == 0:
# self.is_sending_chat_tts_text = True
#asyncio.create_task(self.trigger_chat_tts_text())
#asyncio.create_task(self.trigger_chat_rag_text())
elif message_type == 'SERVER_ERROR':
print(f"服务器错误: {response['payload_msg']}") print(f"服务器错误: {response['payload_msg']}")
raise Exception("服务器错误") raise Exception("服务器错误")
@ -220,7 +345,21 @@ class DialogSession:
if not self.say_hello_over_event.is_set(): if not self.say_hello_over_event.is_set():
print(f"receive tts sayhello ended event") print(f"receive tts sayhello ended event")
self.say_hello_over_event.set() self.say_hello_over_event.set()
# 对于音频模式say hello 音频播放即将开始
# 确保录音保持暂停状态
if self.mod == "audio":
with self.audio_queue_lock:
self.is_recording_paused = True
self.is_playing_audio = True
print("say hello 音频即将开始,确保录音暂停")
if self.mod == "text": if self.mod == "text":
# 文本模式下 say hello 完成,恢复录音状态
with self.audio_queue_lock:
if self.is_recording_paused:
self.is_recording_paused = False
print("文本模式say hello 完成,恢复录音")
print("请输入内容:") print("请输入内容:")
except asyncio.CancelledError: except asyncio.CancelledError:
@ -235,6 +374,28 @@ class DialogSession:
await self.process_audio_file_input(self.audio_file_path) await self.process_audio_file_input(self.audio_file_path)
async def process_text_input(self) -> None: async def process_text_input(self) -> None:
# 程序启动后先静音2秒确保系统稳定
print("文本模式程序启动先静音2秒确保系统稳定...")
with self.audio_queue_lock:
self.is_recording_paused = True
self.is_playing_audio = True # 标记正在播放
# 发送2秒静音数据确保管道清理
silence_data = b'\x00' * config.input_audio_config["chunk"]
for i in range(20): # 2秒 = 20 * 100ms
await self.client.task_request(silence_data)
await asyncio.sleep(0.1)
if i % 10 == 0: # 每秒打印一次进度
print(f"文本模式:静音中... {i//10 + 1}/2秒")
print("文本模式:静音完成,准备 say hello")
# say hello 前确保录音仍处于暂停状态
with self.audio_queue_lock:
self.is_recording_paused = True
self.is_playing_audio = True # 标记正在播放
print("文本模式:准备 say hello确保录音暂停")
await self.client.say_hello() await self.client.say_hello()
await self.say_hello_over_event.wait() await self.say_hello_over_event.wait()
@ -310,20 +471,131 @@ class DialogSession:
await self.client.task_request(silence_data) await self.client.task_request(silence_data)
async def process_microphone_input(self) -> None: async def process_microphone_input(self) -> None:
await self.client.say_hello()
await self.say_hello_over_event.wait()
await self.client.chat_text_query("你好,我也叫豆包")
"""处理麦克风输入""" """处理麦克风输入"""
stream = self.audio_device.open_input_stream() stream = self.audio_device.open_input_stream()
print("已打开麦克风,请讲话...") print("已打开麦克风,请讲话...")
print("音频处理已启动,播放时将发送静音数据避免回声")
# 程序启动后先静音2秒确保系统稳定
print("程序启动先静音2秒确保系统稳定...")
with self.audio_queue_lock:
self.is_recording_paused = True
self.is_playing_audio = True # 标记正在播放
# 发送2秒静音数据确保管道清理
silence_data = b'\x00' * config.input_audio_config["chunk"]
for i in range(20): # 2秒 = 20 * 100ms
await self.client.task_request(silence_data)
await asyncio.sleep(0.1)
if i % 10 == 0: # 每秒打印一次进度
print(f"静音中... {i//10 + 1}/2秒")
print("静音完成,准备 say hello")
# say hello 前确保录音仍处于暂停状态
with self.audio_queue_lock:
self.is_recording_paused = True
self.is_playing_audio = True # 标记正在播放
print("准备 say hello确保录音暂停")
await self.client.say_hello()
await self.say_hello_over_event.wait()
# 注意:不立即恢复录音状态,等待音频实际播放完成
# 录音状态将由音频播放线程在播放超时后自动恢复
print("say hello 请求完成,等待音频播放结束...")
# 创建静音数据
silence_data = b'\x00' * config.input_audio_config["chunk"]
last_silence_time = time.time()
# say hello 期间的特殊处理:确保完全静音
say_hello_silence_sent = False
while self.is_recording: while self.is_recording:
try: try:
current_time = time.time()
# say hello 期间强制静音处理
with self.audio_queue_lock:
is_currently_playing = self.is_playing_audio
if is_currently_playing or not self.say_hello_completed:
# 如果正在播放或者 say hello 未完成,发送静音数据
if current_time - last_silence_time > 0.05: # 每50ms发送一次
await self.client.task_request(silence_data)
last_silence_time = current_time
if not self.say_hello_completed and not is_currently_playing:
print("say hello 期间发送静音数据")
await asyncio.sleep(0.01)
continue
# 检查是否需要发送静音数据(由播放线程触发)- 最高优先级
if self.should_send_silence:
with self.audio_queue_lock:
self.should_send_silence = False
# 获取需要发送的静音数据数量
count = self.silence_send_count
self.silence_send_count = 0
# 批量发送静音数据
if count > 1:
print(f"立即清理录音管道,批量发送{count}组静音数据")
for i in range(count):
await self.client.task_request(silence_data)
await asyncio.sleep(0.005) # 短暂间隔确保发送成功
else:
await self.client.task_request(silence_data)
print("立即清理录音管道,发送静音数据")
last_silence_time = current_time
await asyncio.sleep(0.01)
continue
# 检查录音是否被暂停
with self.audio_queue_lock:
should_pause_recording = self.is_recording_paused
# 检查是否刚刚进入暂停状态
just_paused = should_pause_recording and hasattr(self, 'last_recording_state') and self.last_recording_state != should_pause_recording
self.last_recording_state = should_pause_recording
if should_pause_recording:
# 播放期间:完全停止录音,只发送静音数据
if just_paused or current_time - last_silence_time > 0.1: # 刚暂停或每100ms发送一次静音数据
await self.client.task_request(silence_data)
last_silence_time = current_time
if just_paused:
print("刚进入暂停状态,立即发送静音数据清理管道")
# 每5秒打印一次状态避免过多日志
elif not hasattr(self, 'last_silence_log_time') or current_time - self.last_silence_log_time > 5:
print("正在播放音频,发送静音数据中...")
self.last_silence_log_time = current_time
await asyncio.sleep(0.01)
continue
# 非播放期间:正常录音
last_silence_time = current_time
# 添加exception_on_overflow=False参数来忽略溢出错误 # 添加exception_on_overflow=False参数来忽略溢出错误
audio_data = stream.read(config.input_audio_config["chunk"], exception_on_overflow=False) audio_data = stream.read(config.input_audio_config["chunk"], exception_on_overflow=False)
# 在发送前再次检查是否应该发送静音数据(最后一道防线)
with self.audio_queue_lock:
if self.is_recording_paused or self.is_playing_audio:
# 如果处于暂停状态,丢弃这个音频数据并发送静音
save_input_pcm_to_wav(silence_data, "input.pcm") # 保存静音数据用于调试
await self.client.task_request(silence_data)
# 每50次打印一次日志避免过多输出
if not hasattr(self, 'pause_discard_count') or self.pause_discard_count % 50 == 0:
print(f"暂停期间丢弃音频数据,发送静音数据 (次数: {getattr(self, 'pause_discard_count', 0) + 1})")
self.pause_discard_count = getattr(self, 'pause_discard_count', 0) + 1
await asyncio.sleep(0.01)
continue
# 直接发送所有音频数据,不进行静音检测
save_input_pcm_to_wav(audio_data, "input.pcm") save_input_pcm_to_wav(audio_data, "input.pcm")
await self.client.task_request(audio_data) await self.client.task_request(audio_data)
await asyncio.sleep(0.01) # 避免CPU过度使用 await asyncio.sleep(0.01) # 避免CPU过度使用
except Exception as e: except Exception as e:
print(f"读取麦克风数据出错: {e}") print(f"读取麦克风数据出错: {e}")

View File

@ -1,12 +1,13 @@
import uuid import uuid
import pyaudio import pyaudio
# 配置信息 # 配置信息
ws_connect_config = { ws_connect_config = {
"base_url": "wss://openspeech.bytedance.com/api/v3/realtime/dialogue", "base_url": "wss://openspeech.bytedance.com/api/v3/realtime/dialogue",
"headers": { "headers": {
"X-Api-App-ID": "", "X-Api-App-ID": "8718217928",
"X-Api-Access-Key": "", "X-Api-Access-Key": "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
"X-Api-Resource-Id": "volc.speech.dialog", # 固定值 "X-Api-Resource-Id": "volc.speech.dialog", # 固定值
"X-Api-App-Key": "PlgvMymc7f3tQnJ6", # 固定值 "X-Api-App-Key": "PlgvMymc7f3tQnJ6", # 固定值
"X-Api-Connect-Id": str(uuid.uuid4()), "X-Api-Connect-Id": str(uuid.uuid4()),

BIN
doubao/input.pcm Normal file

Binary file not shown.

BIN
doubao/output.pcm Normal file

Binary file not shown.

View File

@ -22,12 +22,19 @@ class RealtimeDialogClient:
async def connect(self) -> None: async def connect(self) -> None:
"""建立WebSocket连接""" """建立WebSocket连接"""
print(f"url: {self.config['base_url']}, headers: {self.config['headers']}") print(f"url: {self.config['base_url']}, headers: {self.config['headers']}")
# For older websockets versions, use additional_headers instead of extra_headers
self.ws = await websockets.connect( self.ws = await websockets.connect(
self.config['base_url'], self.config['base_url'],
extra_headers=self.config['headers'], additional_headers=self.config['headers'],
ping_interval=None ping_interval=None
) )
self.logid = self.ws.response_headers.get("X-Tt-Logid") # In older websockets versions, response headers are accessed differently
if hasattr(self.ws, 'response_headers'):
self.logid = self.ws.response_headers.get("X-Tt-Logid")
elif hasattr(self.ws, 'headers'):
self.logid = self.ws.headers.get("X-Tt-Logid")
else:
self.logid = "unknown"
print(f"dialog server response logid: {self.logid}") print(f"dialog server response logid: {self.logid}")
# StartConnection request # StartConnection request