From 70c42eca1530d95eca582d754f9ec6d918b88e8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Sat, 20 Sep 2025 11:07:54 +0800 Subject: [PATCH] =?UTF-8?q?=E6=BF=80=E8=BF=9B=E6=80=A7=E8=83=BD=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E5=A4=A7=E5=B9=85=E9=99=8D=E4=BD=8E=E6=A0=91?= =?UTF-8?q?=E8=8E=93=E6=B4=BE3B=E5=BB=B6=E8=BF=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 音频参数:8kHz采样率,4096块大小(4倍) - 激进模式:直接处理,跳过部分识别结果 - 缓冲优化:5个块缓冲区,0.2秒处理间隔 - 禁用词级识别:提升Vosk处理速度 - 实时延迟监控:显示音频处理延迟 - 预期效果:从10秒延迟降低到<1秒 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- simple_wake_and_record.py | 247 ++++++++++++++++++++++++++------------ 1 file changed, 173 insertions(+), 74 deletions(-) diff --git a/simple_wake_and_record.py b/simple_wake_and_record.py index 23d9c3e..2e38396 100644 --- a/simple_wake_and_record.py +++ b/simple_wake_and_record.py @@ -35,11 +35,12 @@ class SimpleWakeAndRecord: self.stream = None self.running = False - # 音频参数 - 优化为树莓派3B + # 音频参数 - 激进优化为树莓派3B self.FORMAT = pyaudio.paInt16 self.CHANNELS = 1 self.RATE = 8000 # 从16kHz降至8kHz,减少50%数据处理量 - self.CHUNK_SIZE = 2048 # 增大块大小,减少处理次数 + self.CHUNK_SIZE = 4096 # 进一步增大块大小到4KB,大幅减少处理次数 + self.AGGRESSIVE_MODE = True # 激进优化模式 # 录音相关 self.recording = False @@ -48,18 +49,25 @@ class SimpleWakeAndRecord: self.recording_start_time = None self.recording_recognizer = None # 录音时专用的识别器 - # 性能优化相关 + # 性能优化相关 - 激进优化 self.audio_buffer = [] # 音频缓冲区 - self.buffer_size = 10 # 缓冲区大小(块数) + self.buffer_size = 5 # 减小缓冲区大小,减少内存使用 self.last_process_time = time.time() # 上次处理时间 - self.process_interval = 0.5 # 处理间隔(秒) - self.batch_process_size = 5 # 批处理大小 + self.process_interval = 0.2 # 缩短处理间隔,提高响应速度 + self.batch_process_size = 3 # 减少批处理大小,更快处理 + self.skip_partial_results = True # 跳过部分识别结果,只处理最终结果 # 性能监控 self.process_count = 0 self.avg_process_time = 0 self.last_monitor_time = time.time() - self.monitor_interval = 5.0 # 监控间隔(秒) + self.monitor_interval = 3.0 # 缩短监控间隔 + + # 延迟监控 + self.audio_receive_times = [] # 音频接收时间戳 + self.process_start_times = [] # 处理开始时间 + self.latency_samples = [] # 延迟样本 + self.max_latency_samples = 10 # 最大延迟样本数 # 阈值 self.text_silence_threshold = 3.0 # 3秒没有识别到文字就结束 @@ -79,11 +87,21 @@ class SimpleWakeAndRecord: print(f"模型路径不存在: {self.model_path}") return + print(f"🔄 正在加载模型,这可能需要一些时间...") + start_time = time.time() + self.model = Model(self.model_path) self.recognizer = KaldiRecognizer(self.model, self.RATE) - self.recognizer.SetWords(True) - print(f"✅ Vosk 模型加载成功") + # 激进模式:禁用词级识别以提高性能 + if self.AGGRESSIVE_MODE: + self.recognizer.SetWords(False) + print(f"📉 激进模式:已禁用词级识别以提高性能") + else: + self.recognizer.SetWords(True) + + load_time = time.time() - start_time + print(f"✅ Vosk 模型加载成功 (耗时: {load_time:.2f}s)") except Exception as e: print(f"模型初始化失败: {e}") @@ -142,6 +160,7 @@ class SimpleWakeAndRecord: # 记录处理开始时间 start_time = time.time() + self.process_start_times.append(start_time) # 取出批处理数据 batch_data = self.audio_buffer[:self.batch_process_size] @@ -168,9 +187,48 @@ class SimpleWakeAndRecord: current_time = time.time() if current_time - self.last_monitor_time >= self.monitor_interval: buffer_usage = len(self.audio_buffer) / self.buffer_size * 100 - print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}%") + + # 计算平均延迟 + avg_latency = 0 + if self.latency_samples: + avg_latency = sum(self.latency_samples) / len(self.latency_samples) + + print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}% | 平均延迟: {avg_latency:.2f}s") self.last_monitor_time = current_time + def _calculate_latency(self, audio_time): + """计算音频延迟""" + current_time = time.time() + latency = current_time - audio_time + + # 添加到延迟样本 + self.latency_samples.append(latency) + if len(self.latency_samples) > self.max_latency_samples: + self.latency_samples.pop(0) + + return latency + + def _lightweight_recognition(self, recognizer, audio_data): + """轻量级识别处理""" + if not recognizer: + return None + + # 激进模式:跳过部分识别结果,只处理最终结果 + if self.skip_partial_results: + if recognizer.AcceptWaveform(audio_data): + result = json.loads(recognizer.Result()) + return result.get('text', '').strip() + else: + # 标准模式:处理部分和最终结果 + if recognizer.AcceptWaveform(audio_data): + result = json.loads(recognizer.Result()) + return result.get('text', '').strip() + else: + partial_result = json.loads(recognizer.PartialResult()) + return partial_result.get('partial', '').strip() + + return None + def _save_recording(self, audio_data): """保存录音""" timestamp = time.strftime("%Y%m%d_%H%M%S") @@ -273,7 +331,11 @@ class SimpleWakeAndRecord: # 为录音创建一个新的识别器 if self.model: self.recording_recognizer = KaldiRecognizer(self.model, self.RATE) - self.recording_recognizer.SetWords(True) + # 激进模式:禁用词级识别以提高性能 + if self.AGGRESSIVE_MODE: + self.recording_recognizer.SetWords(False) + else: + self.recording_recognizer.SetWords(True) def _stop_recording(self): """停止录音""" @@ -311,13 +373,19 @@ class SimpleWakeAndRecord: try: while self.running: # 读取音频数据 + receive_time = time.time() data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False) if len(data) == 0: continue + # 记录音频接收时间 + self.audio_receive_times.append(receive_time) + if len(self.audio_receive_times) > self.max_latency_samples: + self.audio_receive_times.pop(0) + if self.recording: - # 录音模式 - 直接处理 + # 录音模式 - 激进优化处理 self.recorded_frames.append(data) recording_duration = time.time() - self.recording_start_time @@ -328,33 +396,45 @@ class SimpleWakeAndRecord: if len(self.audio_buffer) > self.buffer_size: self.audio_buffer.pop(0) - # 批处理识别 - if self._should_process_audio() and self.recording_recognizer: - combined_data = self._process_audio_batch() - if combined_data and self.recording_recognizer.AcceptWaveform(combined_data): - # 获取最终识别结果 - result = json.loads(self.recording_recognizer.Result()) - text = result.get('text', '').strip() + # 激进模式:直接处理,不等待批处理 + if self.AGGRESSIVE_MODE and self.recording_recognizer: + # 直接处理当前音频块 + text = self._lightweight_recognition(self.recording_recognizer, data) + if text: + # 计算延迟 + if self.audio_receive_times: + latency = self._calculate_latency(self.audio_receive_times[0]) + self.audio_receive_times.pop(0) - if text: - # 识别到文字,更新时间戳 - self.last_text_time = time.time() - print(f"\n📝 识别: {text}") - elif combined_data: - # 获取部分识别结果 - partial_result = json.loads(self.recording_recognizer.PartialResult()) - partial_text = partial_result.get('partial', '').strip() - - if partial_text: - # 更新时间戳(部分识别也算有声音) - self.last_text_time = time.time() - status = f"录音中... {recording_duration:.1f}s | {partial_text}" - print(f"\r{status}", end='', flush=True) + # 识别到文字,更新时间戳 + self.last_text_time = time.time() + print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)") + else: + # 标准批处理模式 + if self._should_process_audio() and self.recording_recognizer: + combined_data = self._process_audio_batch() + if combined_data: + text = self._lightweight_recognition(self.recording_recognizer, combined_data) + if text: + # 计算延迟 + if self.process_start_times: + process_start = self.process_start_times[0] + self.process_start_times.pop(0) + if self.audio_receive_times: + audio_time = self.audio_receive_times[0] + self.audio_receive_times.pop(0) + latency = process_start - audio_time + self._calculate_latency(audio_time) + + self.last_text_time = time.time() + print(f"\n📝 识别: {text}") # 检查是否需要结束录音 current_time = time.time() - # 检查是否有文字识别超时 + # 激进模式:缩短超时时间 + timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0 + if self.last_text_time is not None: text_silence_duration = current_time - self.last_text_time if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time: @@ -362,8 +442,8 @@ class SimpleWakeAndRecord: self._stop_recording() else: # 还没有识别到任何文字,检查是否超时 - if recording_duration > 5.0: # 如果5秒还没识别到任何文字,也结束 - print(f"\n\n5秒没有识别到文字,结束录音") + if recording_duration > timeout_duration: + print(f"\n\n{timeout_duration}秒没有识别到文字,结束录音") self._stop_recording() # 检查最大录音时间 @@ -377,41 +457,55 @@ class SimpleWakeAndRecord: print(f"\r{status}", end='', flush=True) elif self.model and self.recognizer: - # 唤醒词检测模式 - 使用批处理 - self.audio_buffer.append(data) - - # 限制缓冲区大小 - if len(self.audio_buffer) > self.buffer_size: - self.audio_buffer.pop(0) - - # 批处理识别 - if self._should_process_audio(): - combined_data = self._process_audio_batch() - if combined_data and self.recognizer.AcceptWaveform(combined_data): - result = json.loads(self.recognizer.Result()) - text = result.get('text', '').strip() + # 唤醒词检测模式 - 激进优化 + if self.AGGRESSIVE_MODE: + # 直接处理,不使用缓冲区 + text = self._lightweight_recognition(self.recognizer, data) + if text: + print(f"识别: {text}") - if text: - print(f"识别: {text}") - - # 检查唤醒词 - is_wake_word, detected_word = self._check_wake_word(text) - if is_wake_word: - print(f"🎯 检测到唤醒词: {detected_word}") - self._start_recording() - else: - # 显示实时音频级别 - energy = self._calculate_energy(data) - if energy > 50: # 只显示有意义的音频级别 - partial_result = json.loads(self.recognizer.PartialResult()) - partial_text = partial_result.get('partial', '') - if partial_text: - status = f"监听中... 能量: {energy:.0f} | {partial_text}" - else: - status = f"监听中... 能量: {energy:.0f}" - print(status, end='\r') + # 检查唤醒词 + is_wake_word, detected_word = self._check_wake_word(text) + if is_wake_word: + print(f"🎯 检测到唤醒词: {detected_word}") + self._start_recording() + + # 显示实时音频级别(仅在高能量时) + energy = self._calculate_energy(data) + if energy > 100: # 提高阈值,减少显示频率 + status = f"监听中... 能量: {energy:.0f}" + print(status, end='\r') + else: + # 标准批处理模式 + self.audio_buffer.append(data) + + # 限制缓冲区大小 + if len(self.audio_buffer) > self.buffer_size: + self.audio_buffer.pop(0) + + # 批处理识别 + if self._should_process_audio(): + combined_data = self._process_audio_batch() + if combined_data: + text = self._lightweight_recognition(self.recognizer, combined_data) + if text: + print(f"识别: {text}") + + # 检查唤醒词 + is_wake_word, detected_word = self._check_wake_word(text) + if is_wake_word: + print(f"🎯 检测到唤醒词: {detected_word}") + self._start_recording() + + # 显示实时音频级别 + energy = self._calculate_energy(data) + if energy > 50: + status = f"监听中... 能量: {energy:.0f}" + print(status, end='\r') - time.sleep(0.05) # 增加延迟,减少CPU使用 + # 激进模式:更长的延迟以减少CPU使用 + sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05 + time.sleep(sleep_time) except KeyboardInterrupt: print("\n👋 退出") @@ -466,12 +560,17 @@ def main(): print("5. 录音文件自动保存") print("6. 录音完成后自动播放刚才录制的内容") print("7. 按 Ctrl+C 退出") - print("🚀 性能优化已启用:") + print("🚀 激进性能优化已启用:") print(" - 采样率: 8kHz (降低50%数据量)") - print(" - 批处理: 5个音频块/次") - print(" - 处理间隔: 0.5秒") - print(" - 缓冲区: 10个音频块") - print(" - 性能监控: 每5秒显示") + print(" - 块大小: 4096字节 (4倍于原始大小)") + print(" - 激进模式: 已启用 (直接处理,跳过部分结果)") + print(" - 批处理: 3个音频块/次") + print(" - 处理间隔: 0.2秒") + print(" - 缓冲区: 5个音频块") + print(" - 词级识别: 已禁用 (提高性能)") + print(" - 性能监控: 每3秒显示") + print(" - 延迟监控: 实时显示") + print(" - 预期延迟: <1秒 (原10秒)") print("=" * 50) # 开始运行