diff --git a/doubao/__pycache__/audio_manager.cpython-312.pyc b/doubao/__pycache__/audio_manager.cpython-312.pyc
index 1270ee1..db61046 100644
Binary files a/doubao/__pycache__/audio_manager.cpython-312.pyc and b/doubao/__pycache__/audio_manager.cpython-312.pyc differ
diff --git a/doubao/__pycache__/config.cpython-312.pyc b/doubao/__pycache__/config.cpython-312.pyc
index 59a6393..a12dd3a 100644
Binary files a/doubao/__pycache__/config.cpython-312.pyc and b/doubao/__pycache__/config.cpython-312.pyc differ
diff --git a/doubao/audio_manager.py b/doubao/audio_manager.py
index b85ce38..d413eaf 100644
--- a/doubao/audio_manager.py
+++ b/doubao/audio_manager.py
@@ -37,6 +37,13 @@ class AudioDeviceManager:
         self.audio_queue = None
         self.recording = False
 
+        # Buffered-playback state. Sizes are in bytes: int16 PCM takes 2 bytes per sample per channel.
+        self.audio_buffer = bytes()  # immutable bytes rather than a bytearray
+        self.buffer_playback_threshold = self.output_config.sample_rate * self.output_config.channels * 2 * 15  # 15 s of audio
+        self.min_buffer_size = self.output_config.sample_rate * self.output_config.channels * 2 * 3  # buffer at least 3 s
+        self.is_buffer_playing = False
+        self.last_play_time = 0
+
     def open_input_stream(self):
         """打开音频输入流"""
         try:
@@ -87,7 +94,7 @@ class AudioDeviceManager:
             return None
 
     def play_audio(self, audio_data: bytes) -> None:
-        """播放音频数据"""
+        """Play audio data immediately (the original real-time playback path)."""
         try:
             # 将字节数据转换为numpy数组
             audio_array = np.frombuffer(audio_data, dtype=np.int16)
@@ -98,6 +105,51 @@ class AudioDeviceManager:
             sd.wait()  # 等待播放完成
         except Exception as e:
             print(f"音频播放失败: {e}")
+
+    def buffer_audio(self, audio_data: bytes) -> bool:
+        """Append audio data to the buffer and return whether it should be played now."""
+        try:
+            # Concatenate bytes objects instead of extending a buffer in place
+            self.audio_buffer = self.audio_buffer + audio_data
+            current_time = time.time()
+
+            # Decide whether the buffered audio should be played
+            should_play = (
+                len(self.audio_buffer) >= self.buffer_playback_threshold or  # playback threshold reached
+                (len(self.audio_buffer) >= self.min_buffer_size and
+                 current_time - self.last_play_time > 5.0)  # minimum buffered and over 5 s since the last play
+            )
+
+            return should_play
+        except Exception as e:
+            print(f"音频缓冲失败: {e}")
+            return False
+
+    def play_buffered_audio(self) -> None:
+        """Start playback of the buffered audio data and empty the buffer."""
+        try:
+            if not self.audio_buffer:
+                return
+
+            # Convert the buffered bytes into a (frames, channels) int16 array
+            audio_array = np.frombuffer(self.audio_buffer, dtype=np.int16)
+            audio_array = audio_array.reshape(-1, self.output_config.channels)
+
+            # Non-blocking playback. NOTE(review): sd.play() stops any playback still in progress - confirm
+            sd.play(audio_array, samplerate=self.output_config.sample_rate)
+
+            # Empty the buffer
+            self.audio_buffer = bytes()
+            self.last_play_time = time.time()
+            self.is_buffer_playing = True
+
+        except Exception as e:
+            print(f"缓冲音频播放失败: {e}")
+
+    def clear_audio_buffer(self) -> None:
+        """Empty the audio buffer and reset the buffered-playback flag."""
+        self.audio_buffer = bytes()
+        self.is_buffer_playing = False
 
     def read_audio_data(self, frames: int) -> bytes:
         """读取音频数据"""
@@ -142,7 +194,7 @@ class DialogSession:
     mod: str
 
     def __init__(self, ws_config: Dict[str, Any], output_audio_format: str = "pcm", audio_file_path: str = "",
-                 mod: str = "audio", recv_timeout: int = 10):
+                 mod: str = "audio", recv_timeout: int = 10, use_buffered_playback: bool = False):
         self.audio_file_path = audio_file_path
         self.recv_timeout = recv_timeout
         self.is_audio_file_input = self.audio_file_path != ""
@@ -173,6 +225,10 @@ class DialogSession:
         self.last_recording_state = False  # 上次录音状态
         self.say_hello_completed = False  # say hello 是否已完成
 
+        # Buffered-playback settings
+        self.use_buffered_playback = use_buffered_playback  # enable buffered playback when requested by the caller
+        self.buffer_check_interval = 0.1  # buffer check interval in seconds
+
         # 新增:音频输入流控制
         self.input_stream_paused = False  # 输入流是否被暂停
         self.force_silence_mode = False  # 强制静音模式
@@ -196,12 +252,15 @@ class DialogSession:
         # 启动播放线程
         self.is_recording = True
         self.is_playing = True
-        self.player_thread = threading.Thread(target=self._audio_player_thread)
+        if self.use_buffered_playback:
+            self.player_thread = threading.Thread(target=self._buffered_audio_player_thread)
+        else:
+            self.player_thread = threading.Thread(target=self._audio_player_thread)
         self.player_thread.daemon = True
         self.player_thread.start()
 
     def _audio_player_thread(self):
-        """音频播放线程"""
+        """Audio player thread (the original real-time playback mode)."""
         audio_playing_timeout = 1.0  # 1秒没有音频数据认为播放结束
         queue_check_interval = 0.1  # 每100ms检查一次队列状态
 
@@ -273,6 +332,93 @@ class DialogSession:
                     self.is_playing_audio = False
                     self.is_recording_paused = False
                 time.sleep(0.1)
+
+    def _buffered_audio_player_thread(self):
+        """Audio player thread for the buffered playback mode."""
+        audio_playing_timeout = 2.0  # playback counts as finished after 2 s without audio data
+        buffer_check_interval = 0.05  # poll the queue every 50 ms
+
+        print("启动缓冲音频播放线程")
+
+        while self.is_playing:
+            try:
+                current_time = time.time()
+
+                # Check whether new audio data has arrived
+                audio_data = None
+                try:
+                    audio_data = self.audio_queue.get(timeout=buffer_check_interval)
+                except queue.Empty:
+                    pass
+
+                if audio_data is not None:
+                    with self.audio_queue_lock:
+                        # Audio received: update the playback state
+                        was_not_playing = not self.is_playing_audio
+                        if was_not_playing:
+                            # Transition from idle to playing
+                            self.is_playing_audio = True
+                            if not self.is_recording_paused:
+                                self.is_recording_paused = True
+                                print("缓冲播放开始,确认暂停录音")
+
+                        # Remember when audio was last received
+                        self.last_audio_time = current_time
+
+                    # Flush the output pipeline with silence; done outside the lock because play_audio blocks
+                    if was_not_playing:
+                        print("缓冲播放开始前,清理管道")
+                        for _ in range(2):
+                            self.audio_device.play_audio(b'\x00' * len(audio_data))
+                            time.sleep(0.05)
+
+                    # Buffer the audio data
+                    should_play = self.audio_device.buffer_audio(audio_data)
+
+                    # Play the buffered audio once the playback condition is met
+                    if should_play:
+                        print(f"播放缓冲音频,缓冲大小: {len(self.audio_device.audio_buffer)} 字节")
+                        self.audio_device.play_buffered_audio()
+
+                else:
+                    # No new audio data: check for the end-of-playback timeout
+                    with self.audio_queue_lock:
+                        if self.is_playing_audio:
+                            if hasattr(self, 'last_audio_time') and current_time - self.last_audio_time > audio_playing_timeout:
+                                # Timed out: play whatever is still buffered first
+                                if len(self.audio_device.audio_buffer) > 0:
+                                    print("播放超时,播放剩余缓冲音频")
+                                    self.audio_device.play_buffered_audio()
+
+                                # Then restore the recording state
+                                self.is_playing_audio = False
+                                self.is_recording_paused = False
+                                self.force_silence_mode = False
+                                self.input_stream_paused = False
+
+                                # Mark say hello as completed
+                                if hasattr(self, 'say_hello_completed') and not self.say_hello_completed:
+                                    self.say_hello_completed = True
+                                    print("缓冲播放 say hello 音频播放完成")
+
+                                print("缓冲播放超时,恢复录音")
+
+                                # Set the flags that request silence frames to be sent
+                                try:
+                                    silence_data = b'\x00' * config.input_audio_config["chunk"]  # NOTE(review): never used below
+                                    self.silence_send_count = 2
+                                    self.should_send_silence = True
+                                except Exception as e:
+                                    print(f"准备静音数据失败: {e}")
+
+                    time.sleep(buffer_check_interval)
+
+            except Exception as e:
+                print(f"缓冲音频播放错误: {e}")
+                with self.audio_queue_lock:
+                    self.is_playing_audio = False
+                    self.is_recording_paused = False
+                time.sleep(0.1)
 
     # 移除了静音检测函数,避免干扰正常的音频处理
 
@@ -340,6 +486,10 @@ class DialogSession:
                         self.audio_queue.get_nowait()
                     except queue.Empty:
                         continue
+                # In buffered playback mode, also empty the audio device buffer
+                if self.use_buffered_playback:
+                    self.audio_device.clear_audio_buffer()
+                    print("缓冲播放:清空音频设备缓冲区")
                 self.is_user_querying = True
                 print("服务器准备接收用户输入")
 
@@ -380,6 +530,12 @@ class DialogSession:
                 self.is_playing_audio = False
                 self.force_silence_mode = False  # 关闭强制静音模式
                 self.input_stream_paused = False  # 恢复输入流
+
+                # In buffered playback mode, empty the buffer
+                if self.use_buffered_playback:
+                    self.audio_device.clear_audio_buffer()
+                    print("缓冲播放:服务器响应完成,清空音频缓冲区")
+
                 if was_paused:
                     print("服务器响应完成,立即恢复录音")
                     # 设置标志发送静音数据
diff --git a/doubao/input.pcm b/doubao/input.pcm
index c49792a..dd528fe 100644
Binary files a/doubao/input.pcm and b/doubao/input.pcm differ
diff --git a/doubao/main.py b/doubao/main.py
index 291af3a..b873e1b 100644
--- a/doubao/main.py
+++ b/doubao/main.py
@@ -10,10 +10,11 @@ async def main() -> None:
     parser.add_argument("--audio", type=str, default="", help="audio file send to server, if not set, will use microphone input.")
     parser.add_argument("--mod",type=str,default="audio",help="Use mod to select plain text input mode or audio mode, the default is audio mode")
     parser.add_argument("--recv_timeout",type=int,default=10,help="Timeout for receiving messages,value range [10,120]")
+    parser.add_argument("--buffered_playback",action="store_true",help="Enable buffered audio playback mode for better performance on low-end devices")
 
     args = parser.parse_args()
 
-    session = DialogSession(ws_config=config.ws_connect_config, output_audio_format=args.format, audio_file_path=args.audio,mod=args.mod,recv_timeout=args.recv_timeout)
+    session = DialogSession(ws_config=config.ws_connect_config, output_audio_format=args.format, audio_file_path=args.audio,mod=args.mod,recv_timeout=args.recv_timeout, use_buffered_playback=args.buffered_playback)
     await session.start()
 
 if __name__ == "__main__":
diff --git a/doubao/output.pcm b/doubao/output.pcm
index 3b7c93d..c494a42 100644
Binary files a/doubao/output.pcm and b/doubao/output.pcm differ