#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simplified wake-word + recording test.
Focused on avoiding conflicts over the audio device.
"""

import sys
import os
import time
import json
import wave
import platform
import subprocess

import numpy as np
import pyaudio

# Add the current directory to the import path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from vosk import Model, KaldiRecognizer
    VOSK_AVAILABLE = True
except ImportError:
    VOSK_AVAILABLE = False
    print("⚠️ Vosk is not installed. Run: pip install vosk")


class SimpleWakeAndRecord:
    """Simplified wake-word detection + recording system."""

    def __init__(self, model_path="model", wake_words=["你好", "助手"]):
        self.model_path = model_path
        self.wake_words = wake_words
        self.model = None
        self.recognizer = None
        self.audio = None
        self.stream = None
        self.running = False

        # Audio parameters - aggressively tuned for a Raspberry Pi 3B
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 8000        # Drop from 16 kHz to 8 kHz: 50% less data to process
        self.CHUNK_SIZE = 4096  # Larger chunk (4096 frames per read) to cut per-read overhead
        self.AGGRESSIVE_MODE = True  # Aggressive optimization mode

        # Recording state
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None        # Time of the last recognized text
        self.recording_start_time = None
        self.recording_recognizer = None  # Dedicated recognizer used while recording

        # Performance tuning - aggressive settings
        self.audio_buffer = []            # Audio buffer
        self.buffer_size = 5              # Smaller buffer to reduce memory usage
        self.last_process_time = time.time()  # Time of the last batch
        self.process_interval = 0.2       # Shorter processing interval for faster response
        self.batch_process_size = 3       # Smaller batches are processed sooner
        self.skip_partial_results = True  # Skip partial results; only handle final results

        # Performance monitoring
        self.process_count = 0
        self.avg_process_time = 0.0
        self.last_monitor_time = time.time()
        self.monitor_interval = 3.0       # Shorter monitoring interval

        # Latency monitoring
        self.audio_receive_times = []     # Timestamps of received audio chunks
        self.process_start_times = []     # Timestamps of batch processing starts
        self.latency_samples = []         # Latency samples
        self.max_latency_samples = 10     # Maximum number of latency samples kept

        # Thresholds
        self.text_silence_threshold = 3.0  # Stop after 3 s without recognized text
        self.min_recording_time = 2.0      # Minimum recording time
        self.max_recording_time = 30.0     # Maximum recording time

        self._setup_model()
        self._setup_audio()

    def _setup_model(self):
        """Set up the Vosk model."""
        if not VOSK_AVAILABLE:
            return
        try:
            if not os.path.exists(self.model_path):
                print(f"Model path does not exist: {self.model_path}")
                return
            print("🔄 Loading model, this may take a while...")
            start_time = time.time()
            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, self.RATE)
            # Aggressive mode: disable word-level output for better performance
            if self.AGGRESSIVE_MODE:
                self.recognizer.SetWords(False)
                print("📉 Aggressive mode: word-level output disabled for performance")
            else:
                self.recognizer.SetWords(True)
            load_time = time.time() - start_time
            print(f"✅ Vosk model loaded (took {load_time:.2f}s)")
        except Exception as e:
            print(f"Model initialization failed: {e}")

    def _setup_audio(self):
        """Set up the audio device."""
        try:
            if self.audio is None:
                self.audio = pyaudio.PyAudio()
            if self.stream is None:
                self.stream = self.audio.open(
                    format=self.FORMAT,
                    channels=self.CHANNELS,
                    rate=self.RATE,
                    input=True,
                    frames_per_buffer=self.CHUNK_SIZE
                )
            print("✅ Audio device initialized")
        except Exception as e:
            print(f"Audio device initialization failed: {e}")

    def _calculate_energy(self, audio_data):
        """Compute the RMS energy of an audio chunk."""
        if len(audio_data) == 0:
            return 0
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        # Cast to float before squaring to avoid int16 overflow
        rms = np.sqrt(np.mean(audio_array.astype(np.float64) ** 2))
        return rms

    def _check_wake_word(self, text):
        """Check whether the text contains a wake word."""
        if not text or not self.wake_words:
            return False, None
        text_lower = text.lower()
        for wake_word in self.wake_words:
            if wake_word.lower() in text_lower:
                return True, wake_word
        return False, None

    def _should_process_audio(self):
        """Decide whether the buffered audio should be processed now."""
        current_time = time.time()
        return (current_time - self.last_process_time >= self.process_interval and
                len(self.audio_buffer) >= self.batch_process_size)

    def _process_audio_batch(self):
        """Pull a batch of audio chunks off the buffer and merge them."""
        if len(self.audio_buffer) < self.batch_process_size:
            return None
        # Record when processing started
        start_time = time.time()
        self.process_start_times.append(start_time)
        # Take one batch off the front of the buffer
        batch_data = self.audio_buffer[:self.batch_process_size]
        self.audio_buffer = self.audio_buffer[self.batch_process_size:]
        # Merge the chunks into one buffer
        combined_data = b''.join(batch_data)
        # Update the processing timestamp
        self.last_process_time = time.time()
        # Update performance statistics (running average of batch handling time)
        process_time = time.time() - start_time
        self.process_count += 1
        self.avg_process_time = (
            (self.avg_process_time * (self.process_count - 1) + process_time)
            / self.process_count
        )
        # Performance monitoring
        self._monitor_performance()
        return combined_data

    def _monitor_performance(self):
        """Periodically print performance statistics."""
        current_time = time.time()
        if current_time - self.last_monitor_time >= self.monitor_interval:
            buffer_usage = len(self.audio_buffer) / self.buffer_size * 100
            # Average latency over the kept samples
            avg_latency = 0
            if self.latency_samples:
                avg_latency = sum(self.latency_samples) / len(self.latency_samples)
            print(f"\n📊 Performance | batches: {self.process_count} | "
                  f"avg batch time: {self.avg_process_time:.3f}s | "
                  f"buffer usage: {buffer_usage:.1f}% | "
                  f"avg latency: {avg_latency:.2f}s")
            self.last_monitor_time = current_time

    def _calculate_latency(self, audio_time):
        """Compute audio latency relative to a receive timestamp."""
        current_time = time.time()
        latency = current_time - audio_time
        # Keep a bounded list of latency samples
        self.latency_samples.append(latency)
        if len(self.latency_samples) > self.max_latency_samples:
            self.latency_samples.pop(0)
        return latency

    def _lightweight_recognition(self, recognizer, audio_data):
        """Lightweight recognition: feed one chunk and return any recognized text."""
        if not recognizer:
            return None
        if recognizer.AcceptWaveform(audio_data):
            # Final result for this utterance
            result = json.loads(recognizer.Result())
            return result.get('text', '').strip()
        if not self.skip_partial_results:
            # Standard mode: also surface partial (in-progress) results
            partial_result = json.loads(recognizer.PartialResult())
            return partial_result.get('partial', '').strip()
        # Aggressive mode: skip partial results, only report final results
        return None

    def _save_recording(self, audio_data):
        """Save the recording to a WAV file."""
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"recording_{timestamp}.wav"
        try:
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(audio_data)
            print(f"✅ Recording saved: {filename}")
            return True, filename
        except Exception as e:
            print(f"Failed to save recording: {e}")
            return False, None

    def _play_audio(self, filename):
        """Play back an audio file."""
        try:
            # Open the audio file
            with wave.open(filename, 'rb') as wf:
                # Read the audio parameters
                channels = wf.getnchannels()
                width = wf.getsampwidth()
                rate = wf.getframerate()
                total_frames = wf.getnframes()
                # Read the audio in chunks to avoid memory issues
                chunk_size = 1024
                frames = []
                for _ in range(0, total_frames, chunk_size):
                    chunk = wf.readframes(chunk_size)
                    if chunk:
                        frames.append(chunk)
                    else:
                        break
            # Create the playback stream
            playback_stream = self.audio.open(
                format=self.audio.get_format_from_width(width),
                channels=channels,
                rate=rate,
                output=True
            )
            print(f"🔊 Playing: {filename}")
            # Play the audio chunk by chunk
            for chunk in frames:
                playback_stream.write(chunk)
            # Wait for playback to finish
            playback_stream.stop_stream()
            playback_stream.close()
            print("✅ Playback finished")
        except Exception as e:
            print(f"❌ Playback failed: {e}")
            # If PyAudio playback fails, fall back to a system player
            self._play_with_system_player(filename)

    def _play_with_system_player(self, filename):
        """Play the audio with a system player as a fallback."""
        try:
            system = platform.system()
            if system == 'Windows':
                # 'start' is a shell builtin, so open with the default player instead
                print(f"🔊 Opening with the default Windows player: {filename}")
                os.startfile(filename)
            else:
                cmd = ['afplay', filename] if system == 'Darwin' else ['aplay', filename]
                print(f"🔊 Using system player: {' '.join(cmd)}")
                subprocess.run(cmd, check=True)
            print("✅ Playback finished")
        except Exception as e:
            print(f"❌ System player failed as well: {e}")
            print(f"💡 The file is saved; please play it manually: {filename}")

    def _start_recording(self):
        """Start recording."""
        print("🎙️ Recording started, please speak...")
        self.recording = True
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = time.time()
        # Create a fresh recognizer for this recording
        if self.model:
            self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
            # Aggressive mode: disable word-level output for better performance
            if self.AGGRESSIVE_MODE:
                self.recording_recognizer.SetWords(False)
            else:
                self.recording_recognizer.SetWords(True)

    def _stop_recording(self):
        """Stop recording, save the audio, and play it back."""
        if len(self.recorded_frames) > 0:
            audio_data = b''.join(self.recorded_frames)
            # 2 bytes per sample (16-bit mono)
            duration = len(audio_data) / (self.RATE * 2)
            print(f"📝 Recording finished, duration: {duration:.2f}s")
            # Save the recording
            success, filename = self._save_recording(audio_data)
            # If saving succeeded, play it back
            if success and filename:
                print("=" * 50)
                print("🔊 Playing back the recording...")
                self._play_audio(filename)
                print("=" * 50)
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = None
        self.recording_recognizer = None

    def start(self):
        """Start wake-word detection and recording."""
        if not self.stream:
            print("❌ Audio device is not initialized")
            return
        self.running = True
        print("🎤 Listening...")
        print(f"Wake words: {', '.join(self.wake_words)}")
        try:
            while self.running:
                # Read audio data
                receive_time = time.time()
                data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
                if len(data) == 0:
                    continue
                # Record when this chunk was received
                self.audio_receive_times.append(receive_time)
                if len(self.audio_receive_times) > self.max_latency_samples:
                    self.audio_receive_times.pop(0)

                if self.recording:
                    # Recording mode - aggressively optimized handling
                    self.recorded_frames.append(data)
                    recording_duration = time.time() - self.recording_start_time
                    # Also buffer the audio for batch recognition
                    self.audio_buffer.append(data)
                    # Bound the buffer size
                    if len(self.audio_buffer) > self.buffer_size:
                        self.audio_buffer.pop(0)

                    if self.AGGRESSIVE_MODE and self.recording_recognizer:
                        # Aggressive mode: process the current chunk directly, no batching
                        text = self._lightweight_recognition(self.recording_recognizer, data)
                        if text:
                            # Track latency against the oldest pending receive timestamp
                            latency = None
                            if self.audio_receive_times:
                                latency = self._calculate_latency(self.audio_receive_times.pop(0))
                            # Text recognized: refresh the silence timer
                            self.last_text_time = time.time()
                            if latency is not None:
                                print(f"\n📝 Recognized: {text} (latency: {latency:.2f}s)")
                            else:
                                print(f"\n📝 Recognized: {text}")
                    else:
                        # Standard batch mode
                        if self._should_process_audio() and self.recording_recognizer:
                            combined_data = self._process_audio_batch()
                            if combined_data:
                                text = self._lightweight_recognition(self.recording_recognizer, combined_data)
                                if text:
                                    # Record latency for the monitoring stats
                                    if self.process_start_times:
                                        self.process_start_times.pop(0)
                                    if self.audio_receive_times:
                                        self._calculate_latency(self.audio_receive_times.pop(0))
                                    self.last_text_time = time.time()
                                    print(f"\n📝 Recognized: {text}")

                    # Check whether the recording should end
                    current_time = time.time()
                    # Aggressive mode: shorter timeout before giving up
                    timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0
                    if self.last_text_time is not None:
                        text_silence_duration = current_time - self.last_text_time
                        if (text_silence_duration > self.text_silence_threshold and
                                recording_duration >= self.min_recording_time):
                            print(f"\n\nNo text recognized for {self.text_silence_threshold:.0f}s, stopping recording")
                            self._stop_recording()
                    else:
                        # Nothing recognized yet; check for the initial timeout
                        if recording_duration > timeout_duration:
                            print(f"\n\nNo text recognized within {timeout_duration}s, stopping recording")
                            self._stop_recording()
                    # Check the maximum recording time
                    if recording_duration > self.max_recording_time:
                        print(f"\n\nMaximum recording time of {self.max_recording_time}s reached")
                        self._stop_recording()
                    # Show recording status
                    if self.last_text_time is None:
                        status = f"Waiting for speech... {recording_duration:.1f}s"
                        print(f"\r{status}", end='', flush=True)

                elif self.model and self.recognizer:
                    # Wake-word detection mode - aggressively optimized
                    if self.AGGRESSIVE_MODE:
                        # Process directly, without buffering
                        text = self._lightweight_recognition(self.recognizer, data)
                        if text:
                            print(f"Recognized: {text}")
                            # Check for a wake word
                            is_wake_word, detected_word = self._check_wake_word(text)
                            if is_wake_word:
                                print(f"🎯 Wake word detected: {detected_word}")
                                self._start_recording()
                        # Show the live audio level (only when energy is high)
                        energy = self._calculate_energy(data)
                        if energy > 100:  # Higher threshold to reduce display frequency
                            status = f"Listening... energy: {energy:.0f}"
                            print(status, end='\r')
                    else:
                        # Standard batch mode
                        self.audio_buffer.append(data)
                        # Bound the buffer size
                        if len(self.audio_buffer) > self.buffer_size:
                            self.audio_buffer.pop(0)
                        # Batched recognition
                        if self._should_process_audio():
                            combined_data = self._process_audio_batch()
                            if combined_data:
                                text = self._lightweight_recognition(self.recognizer, combined_data)
                                if text:
                                    print(f"Recognized: {text}")
                                    # Check for a wake word
                                    is_wake_word, detected_word = self._check_wake_word(text)
                                    if is_wake_word:
                                        print(f"🎯 Wake word detected: {detected_word}")
                                        self._start_recording()
                        # Show the live audio level
                        energy = self._calculate_energy(data)
                        if energy > 50:
                            status = f"Listening... energy: {energy:.0f}"
                            print(status, end='\r')

                # Aggressive mode: sleep longer to reduce CPU usage
                sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
                time.sleep(sleep_time)
        except KeyboardInterrupt:
            print("\n👋 Exiting")
        except Exception as e:
            print(f"Error: {e}")
        finally:
            self.stop()

    def stop(self):
        """Stop and release audio resources."""
        self.running = False
        if self.recording:
            self._stop_recording()
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.audio:
            self.audio.terminate()
            self.audio = None


def main():
    """Entry point."""
    print("🚀 Simplified wake-word + recording test")
    print("=" * 50)
    # Check that the model directory exists
    model_dir = "model"
    if not os.path.exists(model_dir):
        print("⚠️ Model directory not found")
        print("Please download a Vosk model into the 'model' directory")
        return
    # Create the system
    system = SimpleWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"]
    )
    if not system.model:
        print("❌ Model failed to load")
        return
    print("✅ System initialized")
    print("📖 Usage:")
    print("1. Say a wake word to start recording")
    print("2. Recording stops after 3 s with no recognized text")
    print("3. Recording lasts at least 2 s and at most 30 s")
    print("4. Recognition results are shown live while recording")
    print("5. Recordings are saved automatically")
    print("6. The recording is played back automatically when it finishes")
    print("7. Press Ctrl+C to exit")
    print("🚀 Aggressive performance optimizations enabled:")
    print("   - Sample rate: 8 kHz (50% less data)")
    print("   - Chunk size: 4096 frames per read")
    print("   - Aggressive mode: on (process directly, skip partial results)")
    print("   - Batch size: 3 audio chunks per batch")
    print("   - Processing interval: 0.2 s")
    print("   - Buffer: 5 audio chunks")
    print("   - Word-level output: disabled (for performance)")
    print("   - Performance monitor: printed every 3 s")
    print("   - Latency monitor: shown live")
    print("   - Expected latency: <1 s (previously ~10 s)")
    print("=" * 50)
    # Run
    system.start()


if __name__ == "__main__":
    main()