Local-Voice/simple_wake_and_record.py
朱潮 70c42eca15 激进性能优化:大幅降低树莓派3B延迟
- 音频参数:8kHz采样率,4096块大小(4倍)
- 激进模式:直接处理,跳过部分识别结果
- 缓冲优化:5个块缓冲区,0.2秒处理间隔
- 禁用词级识别:提升Vosk处理速度
- 实时延迟监控:显示音频处理延迟
- 预期效果:从10秒延迟降低到<1秒

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-20 11:07:54 +08:00

580 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
简化的唤醒+录音测试
专注于解决音频冲突问题
"""
import sys
import os
import time
import threading
import pyaudio
import json
# 添加当前目录到路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Vosk is an optional dependency: record whether it could be imported so the
# rest of the module can degrade gracefully instead of failing at import time.
try:
    from vosk import Model, KaldiRecognizer
except ImportError:
    VOSK_AVAILABLE = False
    print("⚠️ Vosk 未安装,请运行: pip install vosk")
else:
    VOSK_AVAILABLE = True
class SimpleWakeAndRecord:
    """Wake-word triggered voice recorder built on Vosk and PyAudio.

    The object continuously reads microphone audio, feeds it to a Vosk
    recognizer and, once one of the configured wake words appears in the
    recognized text, switches to recording mode.  Recording ends when no
    text has been recognized for a while (or a hard time limit is hit);
    the captured audio is then written to a WAV file and played back.
    """

    def __init__(self, model_path="model", wake_words=("你好", "助手")):
        """Initialise state, then load the Vosk model and open the microphone.

        Args:
            model_path: Directory containing the Vosk model.
            wake_words: Iterable of phrases that trigger recording.  It is
                copied, so later changes to the caller's object (or to the
                shared default) do not leak into this instance.
        """
        self.model_path = model_path
        self.wake_words = list(wake_words) if wake_words else []
        self.model = None
        self.recognizer = None
        self.audio = None
        self.stream = None
        self.running = False

        # Audio parameters, tuned aggressively for a Raspberry Pi 3B.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 8000  # 8 kHz instead of 16 kHz halves the data volume
        self.CHUNK_SIZE = 4096  # frames per read; large chunks mean fewer iterations
        self.AGGRESSIVE_MODE = True  # recognise every chunk directly, no batching

        # Recording state.
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None  # when text was last recognised while recording
        self.recording_start_time = None
        self.recording_recognizer = None  # dedicated recognizer for a recording

        # Batch-processing state (used when AGGRESSIVE_MODE is off).
        self.audio_buffer = []
        self.buffer_size = 5  # maximum number of buffered chunks
        self.last_process_time = time.time()
        self.process_interval = 0.2  # minimum seconds between two batches
        self.batch_process_size = 3  # chunks consumed per batch
        self.skip_partial_results = True  # only report final recognition results

        # Performance monitoring.
        self.process_count = 0
        self.avg_process_time = 0
        self.last_monitor_time = time.time()
        self.monitor_interval = 3.0  # seconds between monitor print-outs

        # Latency monitoring.
        self.audio_receive_times = []  # timestamps taken before each read
        self.process_start_times = []  # timestamps taken at each batch start
        self.latency_samples = []  # most recent latency measurements
        self.max_latency_samples = 10  # cap for the timestamp/latency lists

        # Thresholds, all in seconds.
        self.text_silence_threshold = 3.0  # stop after this long without text
        self.min_recording_time = 2.0
        self.max_recording_time = 30.0

        self._setup_model()
        self._setup_audio()

    def _create_recognizer(self):
        """Return a new KaldiRecognizer configured for the current mode.

        Word-level results are disabled in aggressive mode to save CPU.
        """
        recognizer = KaldiRecognizer(self.model, self.RATE)
        recognizer.SetWords(not self.AGGRESSIVE_MODE)
        return recognizer

    def _setup_model(self):
        """Load the Vosk model and create the wake-word recognizer.

        On any failure a message is printed and ``self.model`` /
        ``self.recognizer`` are left unset; no exception propagates.
        """
        if not VOSK_AVAILABLE:
            return
        try:
            if not os.path.exists(self.model_path):
                print(f"模型路径不存在: {self.model_path}")
                return
            print("🔄 正在加载模型,这可能需要一些时间...")
            start_time = time.time()
            self.model = Model(self.model_path)
            self.recognizer = self._create_recognizer()
            if self.AGGRESSIVE_MODE:
                print("📉 激进模式:已禁用词级识别以提高性能")
            load_time = time.time() - start_time
            print(f"✅ Vosk 模型加载成功 (耗时: {load_time:.2f}s)")
        except Exception as e:
            print(f"模型初始化失败: {e}")

    def _setup_audio(self):
        """Create the PyAudio instance and open the input stream if needed.

        Failures are reported on stdout and leave ``self.stream`` as None.
        """
        try:
            if self.audio is None:
                self.audio = pyaudio.PyAudio()
            if self.stream is None:
                self.stream = self.audio.open(
                    format=self.FORMAT,
                    channels=self.CHANNELS,
                    rate=self.RATE,
                    input=True,
                    frames_per_buffer=self.CHUNK_SIZE
                )
            print("✅ 音频设备初始化成功")
        except Exception as e:
            print(f"音频设备初始化失败: {e}")

    def _calculate_energy(self, audio_data):
        """Return the RMS amplitude of 16-bit PCM ``audio_data``.

        Args:
            audio_data: Raw bytes of signed 16-bit samples.

        Returns:
            The root-mean-square sample value as a float, or 0 for empty
            input.
        """
        if not audio_data:
            return 0
        import numpy as np
        # Drop a trailing odd byte so frombuffer never sees a partial sample.
        usable = len(audio_data) - (len(audio_data) % 2)
        if usable == 0:
            return 0
        samples = np.frombuffer(audio_data[:usable], dtype=np.int16)
        # Square in float64: squaring int16 values wraps around for any
        # amplitude above ~181 and yields a meaningless (or NaN) RMS.
        samples = samples.astype(np.float64)
        return float(np.sqrt(np.mean(samples ** 2)))

    def _check_wake_word(self, text):
        """Check whether ``text`` contains one of the wake words.

        Returns:
            ``(True, wake_word)`` for the first configured wake word found
            (case-insensitive substring match), otherwise ``(False, None)``.
        """
        if not text or not self.wake_words:
            return False, None
        text_lower = text.lower()
        for wake_word in self.wake_words:
            if wake_word.lower() in text_lower:
                return True, wake_word
        return False, None

    def _should_process_audio(self):
        """Return True when a batch is due: interval elapsed and enough chunks."""
        elapsed = time.time() - self.last_process_time
        return (elapsed >= self.process_interval and
                len(self.audio_buffer) >= self.batch_process_size)

    def _process_audio_batch(self):
        """Remove one batch of chunks from the buffer and return it joined.

        Returns:
            The concatenated bytes of ``batch_process_size`` chunks, or None
            when the buffer holds fewer chunks than that.
        """
        if len(self.audio_buffer) < self.batch_process_size:
            return None

        start_time = time.time()
        self.process_start_times.append(start_time)
        # Entries are only consumed when text is recognised, so cap the list
        # to keep it from growing without bound during long silences.
        excess = len(self.process_start_times) - self.max_latency_samples
        if excess > 0:
            del self.process_start_times[:excess]

        batch_data = self.audio_buffer[:self.batch_process_size]
        self.audio_buffer = self.audio_buffer[self.batch_process_size:]
        combined_data = b''.join(batch_data)

        self.last_process_time = time.time()

        # Running average of the time spent assembling a batch.
        process_time = time.time() - start_time
        self.process_count += 1
        self.avg_process_time = (
            self.avg_process_time * (self.process_count - 1) + process_time
        ) / self.process_count

        self._monitor_performance()
        return combined_data

    def _monitor_performance(self):
        """Print performance statistics at most once per ``monitor_interval``."""
        current_time = time.time()
        if current_time - self.last_monitor_time < self.monitor_interval:
            return
        buffer_usage = len(self.audio_buffer) / self.buffer_size * 100
        avg_latency = 0
        if self.latency_samples:
            avg_latency = sum(self.latency_samples) / len(self.latency_samples)
        print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}% | 平均延迟: {avg_latency:.2f}s")
        self.last_monitor_time = current_time

    def _calculate_latency(self, audio_time):
        """Record and return the seconds elapsed since ``audio_time``.

        The value is appended to ``latency_samples``, which keeps only the
        newest ``max_latency_samples`` entries.
        """
        latency = time.time() - audio_time
        self.latency_samples.append(latency)
        if len(self.latency_samples) > self.max_latency_samples:
            self.latency_samples.pop(0)
        return latency

    def _lightweight_recognition(self, recognizer, audio_data):
        """Feed ``audio_data`` to ``recognizer`` and return recognised text.

        Returns:
            The stripped final text when the recognizer accepts the waveform
            as a complete utterance; otherwise the stripped partial text, or
            None when ``skip_partial_results`` is set or no recognizer was
            given.
        """
        if not recognizer:
            return None
        if recognizer.AcceptWaveform(audio_data):
            result = json.loads(recognizer.Result())
            return result.get('text', '').strip()
        if self.skip_partial_results:
            return None
        partial_result = json.loads(recognizer.PartialResult())
        return partial_result.get('partial', '').strip()

    def _save_recording(self, audio_data):
        """Write ``audio_data`` to a timestamped WAV file in the working dir.

        Returns:
            ``(True, filename)`` on success, ``(False, None)`` on failure.
        """
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"recording_{timestamp}.wav"
        try:
            import wave
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(audio_data)
            print(f"✅ 录音已保存: {filename}")
            return True, filename
        except Exception as e:
            print(f"保存录音失败: {e}")
            return False, None

    def _play_audio(self, filename):
        """Play a WAV file through PyAudio, falling back to a system player.

        The file is streamed chunk by chunk rather than loaded into memory,
        and the output stream is always closed, even when playback fails.
        """
        import wave
        failure = None
        playback_stream = None
        try:
            with wave.open(filename, 'rb') as wf:
                playback_stream = self.audio.open(
                    format=self.audio.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True
                )
                print(f"🔊 开始播放: {filename}")
                while True:
                    chunk = wf.readframes(1024)
                    if not chunk:
                        break
                    playback_stream.write(chunk)
            # Blocks until the buffered audio has finished playing.
            playback_stream.stop_stream()
        except Exception as e:
            failure = e
        finally:
            if playback_stream is not None:
                try:
                    playback_stream.close()
                except Exception as close_error:
                    # A failed close also counts as a playback failure.
                    if failure is None:
                        failure = close_error

        if failure is None:
            print("✅ 播放完成")
            return
        print(f"❌ 播放失败: {failure}")
        # The PyAudio stream is closed by now, so the external player does
        # not have to compete with it for the output device.
        self._play_with_system_player(filename)

    def _play_with_system_player(self, filename):
        """Play ``filename`` with the platform's command-line audio player."""
        try:
            import platform
            import subprocess
            system = platform.system()
            if system == 'Darwin':  # macOS
                cmd = ['afplay', filename]
            elif system == 'Windows':
                # `start` is a cmd.exe builtin, not an executable, so it has
                # to be run through cmd; the empty string is the window
                # title that `start` expects before its options.
                cmd = ['cmd', '/c', 'start', '', '/min', filename]
            else:  # Linux
                cmd = ['aplay', filename]
            print(f"🔊 使用系统播放器: {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
            print("✅ 播放完成")
        except Exception as e:
            print(f"❌ 系统播放器也失败: {e}")
            print(f"💡 文件已保存,请手动播放: {filename}")

    def _start_recording(self):
        """Switch to recording mode and reset all per-recording state."""
        print("🎙️ 开始录音,请说话...")
        self.recording = True
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = time.time()
        # A fresh recognizer keeps wake-word audio out of the transcript.
        if self.model:
            self.recording_recognizer = self._create_recognizer()

    def _stop_recording(self):
        """Leave recording mode; save and play back any captured audio."""
        if self.recorded_frames:
            audio_data = b''.join(self.recorded_frames)
            # paInt16 samples are 2 bytes each, per channel.
            duration = len(audio_data) / (self.RATE * 2 * self.CHANNELS)
            print(f"📝 录音完成,时长: {duration:.2f}秒")
            success, filename = self._save_recording(audio_data)
            if success and filename:
                print("=" * 50)
                print("🔊 播放刚才录制的音频...")
                self._play_audio(filename)
                print("=" * 50)
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = None
        self.recording_recognizer = None

    def _buffer_chunk(self, data):
        """Append ``data`` to the batch buffer, dropping the oldest overflow."""
        self.audio_buffer.append(data)
        if len(self.audio_buffer) > self.buffer_size:
            self.audio_buffer.pop(0)

    def _handle_recording_chunk(self, data):
        """Process one audio chunk while in recording mode.

        Stores the chunk, runs recognition on it, and stops the recording
        when a silence, no-speech or maximum-duration limit is reached.
        """
        self.recorded_frames.append(data)
        recording_duration = time.time() - self.recording_start_time
        self._buffer_chunk(data)

        if self.AGGRESSIVE_MODE and self.recording_recognizer:
            # Aggressive mode: recognise the current chunk immediately.
            text = self._lightweight_recognition(self.recording_recognizer, data)
            if text:
                # NOTE(review): latency is measured against the oldest queued
                # receive timestamp, which may predate this chunk - confirm
                # that this is the intended metric.
                latency = 0.0
                if self.audio_receive_times:
                    latency = self._calculate_latency(self.audio_receive_times.pop(0))
                self.last_text_time = time.time()
                print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)")
        elif self._should_process_audio() and self.recording_recognizer:
            # Standard mode: recognise a batch of buffered chunks.
            combined_data = self._process_audio_batch()
            if combined_data:
                text = self._lightweight_recognition(self.recording_recognizer, combined_data)
                if text:
                    if self.process_start_times:
                        self.process_start_times.pop(0)
                        if self.audio_receive_times:
                            self._calculate_latency(self.audio_receive_times.pop(0))
                    self.last_text_time = time.time()
                    print(f"\n📝 识别: {text}")

        # Decide whether the recording should end; at most one reason fires.
        current_time = time.time()
        # Aggressive mode gives up sooner when nothing is recognised at all.
        timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0
        stop_message = None
        if self.last_text_time is not None:
            text_silence_duration = current_time - self.last_text_time
            if (text_silence_duration > self.text_silence_threshold
                    and recording_duration >= self.min_recording_time):
                # ":g" prints 3.0 as "3", matching the default-threshold wording.
                stop_message = f"\n\n{self.text_silence_threshold:g}秒没有识别到文字结束录音"
        elif recording_duration > timeout_duration:
            stop_message = f"\n\n{timeout_duration}秒没有识别到文字,结束录音"
        if stop_message is None and recording_duration > self.max_recording_time:
            stop_message = f"\n\n达到最大录音时间 {self.max_recording_time}s"

        if stop_message is not None:
            print(stop_message)
            self._stop_recording()
            return  # the recording is over; no "waiting" status to show

        if self.last_text_time is None:
            status = f"等待语音输入... {recording_duration:.1f}s"
            print(f"\r{status}", end='', flush=True)

    def _handle_listening_chunk(self, data):
        """Process one audio chunk while waiting for a wake word."""
        text = None
        if self.AGGRESSIVE_MODE:
            # Recognise the chunk directly, bypassing the batch buffer.
            text = self._lightweight_recognition(self.recognizer, data)
            energy_threshold = 100  # higher threshold: fewer status updates
        else:
            self._buffer_chunk(data)
            if self._should_process_audio():
                combined_data = self._process_audio_batch()
                if combined_data:
                    text = self._lightweight_recognition(self.recognizer, combined_data)
            energy_threshold = 50

        if text:
            print(f"识别: {text}")
            is_wake_word, detected_word = self._check_wake_word(text)
            if is_wake_word:
                print(f"🎯 检测到唤醒词: {detected_word}")
                self._start_recording()

        # Show the live input level only when there is noticeable signal.
        energy = self._calculate_energy(data)
        if energy > energy_threshold:
            status = f"监听中... 能量: {energy:.0f}"
            print(status, end='\r')

    def start(self):
        """Run the blocking listen/record loop until stopped or interrupted.

        Returns immediately if the audio stream was never opened.  Ctrl+C
        and unexpected errors end the loop; ``stop()`` is always called on
        the way out.
        """
        if not self.stream:
            print("❌ 音频设备未初始化")
            return
        self.running = True
        print("🎤 开始监听...")
        print(f"唤醒词: {', '.join(self.wake_words)}")
        try:
            while self.running:
                receive_time = time.time()
                data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
                if len(data) == 0:
                    continue

                # Keep a bounded queue of receive timestamps for latency stats.
                self.audio_receive_times.append(receive_time)
                if len(self.audio_receive_times) > self.max_latency_samples:
                    self.audio_receive_times.pop(0)

                if self.recording:
                    self._handle_recording_chunk(data)
                elif self.model and self.recognizer:
                    self._handle_listening_chunk(data)

                # Aggressive mode sleeps longer to reduce CPU usage.
                sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
                time.sleep(sleep_time)
        except KeyboardInterrupt:
            print("\n👋 退出")
        except Exception as e:
            print(f"错误: {e}")
        finally:
            self.stop()

    def stop(self):
        """Stop the loop, finish any active recording and release audio.

        A failure while closing the stream is reported but does not prevent
        the PyAudio instance from being terminated.
        """
        self.running = False
        if self.recording:
            self._stop_recording()
        if self.stream:
            try:
                self.stream.stop_stream()
                self.stream.close()
            except Exception as e:
                print(f"关闭音频流失败: {e}")
            finally:
                self.stream = None
        if self.audio:
            self.audio.terminate()
            self.audio = None
def main():
    """Command-line entry point for the wake-word recording demo.

    Verifies that the ``model`` directory exists, builds a
    ``SimpleWakeAndRecord`` instance, prints the usage banner and then
    enters the blocking listen loop.  Returns early (None) when the model
    directory is missing or the model failed to load.
    """
    divider = "=" * 50
    print("🚀 简化唤醒+录音测试")
    print(divider)

    # The model is looked up relative to the current working directory.
    model_dir = "model"
    if not os.path.exists(model_dir):
        print("⚠️ 未找到模型目录")
        print("请下载 Vosk 模型到 model 目录")
        return

    system = SimpleWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"]
    )
    if not system.model:
        print("❌ 模型加载失败")
        return

    # Usage notes followed by a summary of the active performance settings.
    banner_lines = (
        "✅ 系统初始化成功",
        "📖 使用说明:",
        "1. 说唤醒词开始录音",
        "2. 基于语音识别判断3秒没有识别到文字就结束",
        "3. 最少录音2秒最多30秒",
        "4. 录音时实时显示识别结果",
        "5. 录音文件自动保存",
        "6. 录音完成后自动播放刚才录制的内容",
        "7. 按 Ctrl+C 退出",
        "🚀 激进性能优化已启用:",
        " - 采样率: 8kHz (降低50%数据量)",
        " - 块大小: 4096字节 (4倍于原始大小)",
        " - 激进模式: 已启用 (直接处理,跳过部分结果)",
        " - 批处理: 3个音频块/次",
        " - 处理间隔: 0.2秒",
        " - 缓冲区: 5个音频块",
        " - 词级识别: 已禁用 (提高性能)",
        " - 性能监控: 每3秒显示",
        " - 延迟监控: 实时显示",
        " - 预期延迟: <1秒 (原10秒)",
        divider,
    )
    for line in banner_lines:
        print(line)

    system.start()


if __name__ == "__main__":
    main()