激进性能优化:大幅降低树莓派3B延迟

- 音频参数:8kHz采样率,4096块大小(4倍)
- 激进模式:直接处理,跳过部分识别结果
- 缓冲优化:5个块缓冲区,0.2秒处理间隔
- 禁用词级识别:提升Vosk处理速度
- 实时延迟监控:显示音频处理延迟
- 预期效果:从10秒延迟降低到<1秒

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
朱潮 2025-09-20 11:07:54 +08:00
parent eb099d827d
commit 70c42eca15

View File

@ -35,11 +35,12 @@ class SimpleWakeAndRecord:
self.stream = None
self.running = False
# 音频参数 - 优化为树莓派3B
# 音频参数 - 激进优化为树莓派3B
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 8000 # 从16kHz降至8kHz减少50%数据处理量
self.CHUNK_SIZE = 2048 # 增大块大小,减少处理次数
self.CHUNK_SIZE = 4096 # 进一步增大块大小到4KB大幅减少处理次数
self.AGGRESSIVE_MODE = True # 激进优化模式
# 录音相关
self.recording = False
@ -48,18 +49,25 @@ class SimpleWakeAndRecord:
self.recording_start_time = None
self.recording_recognizer = None # 录音时专用的识别器
# 性能优化相关
# 性能优化相关 - 激进优化
self.audio_buffer = [] # 音频缓冲区
self.buffer_size = 10 # 缓冲区大小(块数)
self.buffer_size = 5 # 减小缓冲区大小,减少内存使用
self.last_process_time = time.time() # 上次处理时间
self.process_interval = 0.5 # 处理间隔(秒)
self.batch_process_size = 5 # 批处理大小
self.process_interval = 0.2 # 缩短处理间隔,提高响应速度
self.batch_process_size = 3 # 减少批处理大小,更快处理
self.skip_partial_results = True # 跳过部分识别结果,只处理最终结果
# 性能监控
self.process_count = 0
self.avg_process_time = 0
self.last_monitor_time = time.time()
self.monitor_interval = 5.0 # 监控间隔(秒)
self.monitor_interval = 3.0 # 缩短监控间隔
# 延迟监控
self.audio_receive_times = [] # 音频接收时间戳
self.process_start_times = [] # 处理开始时间
self.latency_samples = [] # 延迟样本
self.max_latency_samples = 10 # 最大延迟样本数
# 阈值
self.text_silence_threshold = 3.0 # 3秒没有识别到文字就结束
@ -79,11 +87,21 @@ class SimpleWakeAndRecord:
print(f"模型路径不存在: {self.model_path}")
return
print(f"🔄 正在加载模型,这可能需要一些时间...")
start_time = time.time()
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, self.RATE)
# 激进模式:禁用词级识别以提高性能
if self.AGGRESSIVE_MODE:
self.recognizer.SetWords(False)
print(f"📉 激进模式:已禁用词级识别以提高性能")
else:
self.recognizer.SetWords(True)
print(f"✅ Vosk 模型加载成功")
load_time = time.time() - start_time
print(f"✅ Vosk 模型加载成功 (耗时: {load_time:.2f}s)")
except Exception as e:
print(f"模型初始化失败: {e}")
@ -142,6 +160,7 @@ class SimpleWakeAndRecord:
# 记录处理开始时间
start_time = time.time()
self.process_start_times.append(start_time)
# 取出批处理数据
batch_data = self.audio_buffer[:self.batch_process_size]
@ -168,9 +187,48 @@ class SimpleWakeAndRecord:
current_time = time.time()
if current_time - self.last_monitor_time >= self.monitor_interval:
buffer_usage = len(self.audio_buffer) / self.buffer_size * 100
print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}%")
# 计算平均延迟
avg_latency = 0
if self.latency_samples:
avg_latency = sum(self.latency_samples) / len(self.latency_samples)
print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}% | 平均延迟: {avg_latency:.2f}s")
self.last_monitor_time = current_time
def _calculate_latency(self, audio_time):
"""计算音频延迟"""
current_time = time.time()
latency = current_time - audio_time
# 添加到延迟样本
self.latency_samples.append(latency)
if len(self.latency_samples) > self.max_latency_samples:
self.latency_samples.pop(0)
return latency
def _lightweight_recognition(self, recognizer, audio_data):
"""轻量级识别处理"""
if not recognizer:
return None
# 激进模式:跳过部分识别结果,只处理最终结果
if self.skip_partial_results:
if recognizer.AcceptWaveform(audio_data):
result = json.loads(recognizer.Result())
return result.get('text', '').strip()
else:
# 标准模式:处理部分和最终结果
if recognizer.AcceptWaveform(audio_data):
result = json.loads(recognizer.Result())
return result.get('text', '').strip()
else:
partial_result = json.loads(recognizer.PartialResult())
return partial_result.get('partial', '').strip()
return None
def _save_recording(self, audio_data):
"""保存录音"""
timestamp = time.strftime("%Y%m%d_%H%M%S")
@ -273,6 +331,10 @@ class SimpleWakeAndRecord:
# 为录音创建一个新的识别器
if self.model:
self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
# 激进模式:禁用词级识别以提高性能
if self.AGGRESSIVE_MODE:
self.recording_recognizer.SetWords(False)
else:
self.recording_recognizer.SetWords(True)
def _stop_recording(self):
@ -311,13 +373,19 @@ class SimpleWakeAndRecord:
try:
while self.running:
# 读取音频数据
receive_time = time.time()
data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
if len(data) == 0:
continue
# 记录音频接收时间
self.audio_receive_times.append(receive_time)
if len(self.audio_receive_times) > self.max_latency_samples:
self.audio_receive_times.pop(0)
if self.recording:
# 录音模式 - 直接处理
# 录音模式 - 激进优化处理
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
@ -328,33 +396,45 @@ class SimpleWakeAndRecord:
if len(self.audio_buffer) > self.buffer_size:
self.audio_buffer.pop(0)
# 批处理识别
if self._should_process_audio() and self.recording_recognizer:
combined_data = self._process_audio_batch()
if combined_data and self.recording_recognizer.AcceptWaveform(combined_data):
# 获取最终识别结果
result = json.loads(self.recording_recognizer.Result())
text = result.get('text', '').strip()
# 激进模式:直接处理,不等待批处理
if self.AGGRESSIVE_MODE and self.recording_recognizer:
# 直接处理当前音频块
text = self._lightweight_recognition(self.recording_recognizer, data)
if text:
# 计算延迟
if self.audio_receive_times:
latency = self._calculate_latency(self.audio_receive_times[0])
self.audio_receive_times.pop(0)
# 识别到文字,更新时间戳
self.last_text_time = time.time()
print(f"\n📝 识别: {text}")
elif combined_data:
# 获取部分识别结果
partial_result = json.loads(self.recording_recognizer.PartialResult())
partial_text = partial_result.get('partial', '').strip()
print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)")
else:
# 标准批处理模式
if self._should_process_audio() and self.recording_recognizer:
combined_data = self._process_audio_batch()
if combined_data:
text = self._lightweight_recognition(self.recording_recognizer, combined_data)
if text:
# 计算延迟
if self.process_start_times:
process_start = self.process_start_times[0]
self.process_start_times.pop(0)
if self.audio_receive_times:
audio_time = self.audio_receive_times[0]
self.audio_receive_times.pop(0)
latency = process_start - audio_time
self._calculate_latency(audio_time)
if partial_text:
# 更新时间戳(部分识别也算有声音)
self.last_text_time = time.time()
status = f"录音中... {recording_duration:.1f}s | {partial_text}"
print(f"\r{status}", end='', flush=True)
print(f"\n📝 识别: {text}")
# 检查是否需要结束录音
current_time = time.time()
# 检查是否有文字识别超时
# 激进模式:缩短超时时间
timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0
if self.last_text_time is not None:
text_silence_duration = current_time - self.last_text_time
if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
@ -362,8 +442,8 @@ class SimpleWakeAndRecord:
self._stop_recording()
else:
# 还没有识别到任何文字,检查是否超时
if recording_duration > 5.0: # 如果5秒还没识别到任何文字也结束
print(f"\n\n5秒没有识别到文字,结束录音")
if recording_duration > timeout_duration:
print(f"\n\n{timeout_duration}秒没有识别到文字,结束录音")
self._stop_recording()
# 检查最大录音时间
@ -377,7 +457,26 @@ class SimpleWakeAndRecord:
print(f"\r{status}", end='', flush=True)
elif self.model and self.recognizer:
# 唤醒词检测模式 - 使用批处理
# 唤醒词检测模式 - 激进优化
if self.AGGRESSIVE_MODE:
# 直接处理,不使用缓冲区
text = self._lightweight_recognition(self.recognizer, data)
if text:
print(f"识别: {text}")
# 检查唤醒词
is_wake_word, detected_word = self._check_wake_word(text)
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
# 显示实时音频级别(仅在高能量时)
energy = self._calculate_energy(data)
if energy > 100: # 提高阈值,减少显示频率
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
else:
# 标准批处理模式
self.audio_buffer.append(data)
# 限制缓冲区大小
@ -387,10 +486,8 @@ class SimpleWakeAndRecord:
# 批处理识别
if self._should_process_audio():
combined_data = self._process_audio_batch()
if combined_data and self.recognizer.AcceptWaveform(combined_data):
result = json.loads(self.recognizer.Result())
text = result.get('text', '').strip()
if combined_data:
text = self._lightweight_recognition(self.recognizer, combined_data)
if text:
print(f"识别: {text}")
@ -399,19 +496,16 @@ class SimpleWakeAndRecord:
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
else:
# 显示实时音频级别
energy = self._calculate_energy(data)
if energy > 50: # 只显示有意义的音频级别
partial_result = json.loads(self.recognizer.PartialResult())
partial_text = partial_result.get('partial', '')
if partial_text:
status = f"监听中... 能量: {energy:.0f} | {partial_text}"
else:
if energy > 50:
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
time.sleep(0.05) # 增加延迟减少CPU使用
# 激进模式更长的延迟以减少CPU使用
sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
time.sleep(sleep_time)
except KeyboardInterrupt:
print("\n👋 退出")
@ -466,12 +560,17 @@ def main():
print("5. 录音文件自动保存")
print("6. 录音完成后自动播放刚才录制的内容")
print("7. 按 Ctrl+C 退出")
print("🚀 性能优化已启用:")
print("🚀 激进性能优化已启用:")
print(" - 采样率: 8kHz (降低50%数据量)")
print(" - 批处理: 5个音频块/次")
print(" - 处理间隔: 0.5秒")
print(" - 缓冲区: 10个音频块")
print(" - 性能监控: 每5秒显示")
print(" - 块大小: 4096字节 (4倍于原始大小)")
print(" - 激进模式: 已启用 (直接处理,跳过部分结果)")
print(" - 批处理: 3个音频块/次")
print(" - 处理间隔: 0.2秒")
print(" - 缓冲区: 5个音频块")
print(" - 词级识别: 已禁用 (提高性能)")
print(" - 性能监控: 每3秒显示")
print(" - 延迟监控: 实时显示")
print(" - 预期延迟: <1秒 (原10秒)")
print("=" * 50)
# 开始运行