This commit is contained in:
朱潮 2025-09-20 12:53:58 +08:00
parent 072bb0e6b0
commit df9616b48a
5 changed files with 94 additions and 1474 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -1,501 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Wake-word + recording system with integrated speech recognition.
Built on simple_wake_and_record.py, with speech recognition added.
"""
import sys
import os
import time
import threading
import pyaudio
import json
import asyncio
from typing import Optional, List
# 添加当前目录到路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from vosk import Model, KaldiRecognizer
VOSK_AVAILABLE = True
except ImportError:
VOSK_AVAILABLE = False
print("⚠️ Vosk 未安装,请运行: pip install vosk")
from speech_recognizer import SpeechRecognizer, RecognitionResult
class EnhancedWakeAndRecord:
"""增强的唤醒+录音系统,集成语音识别"""
def __init__(self, model_path="model", wake_words=None,
             enable_speech_recognition=True, app_key=None, access_key=None):
    """Create the wake-word + recording system and initialise its back ends.

    Args:
        model_path: Path of the Vosk model directory.
        wake_words: Phrases that switch the loop into recording mode.
            ``None`` (the default) selects ``["你好", "助手"]``.
        enable_speech_recognition: When true, a ``SpeechRecognizer`` is
            created and run on finished recordings.
        app_key: Passed through to ``SpeechRecognizer``.
        access_key: Passed through to ``SpeechRecognizer``.
    """
    self.model_path = model_path
    # Build the default per instance: a list literal in the signature is a
    # single object shared (and mutable) across every instance.
    self.wake_words = ["你好", "助手"] if wake_words is None else wake_words
    self.enable_speech_recognition = enable_speech_recognition
    self.model = None
    self.recognizer = None
    self.audio = None
    self.stream = None
    self.running = False
    # Audio capture parameters
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    self.RATE = 16000
    self.CHUNK_SIZE = 1024
    # Recording state
    self.recording = False
    self.recorded_frames = []
    self.last_text_time = None
    self.recording_start_time = None
    self.recording_recognizer = None
    # Thresholds, in seconds
    self.text_silence_threshold = 3.0
    self.min_recording_time = 2.0
    self.max_recording_time = 30.0
    # Speech recognition state
    self.speech_recognizer = None
    self.last_recognition_result = None
    self.recognition_thread = None
    # Callback invoked with the final recognition result
    self.on_recognition_result = None
    self._setup_model()
    self._setup_audio()
    self._setup_speech_recognition(app_key, access_key)
def _setup_model(self):
    """Load the Vosk model and create the wake-word recognizer.

    Does nothing when Vosk is unavailable. A missing model directory or any
    loading error is reported on stdout and leaves the model unset.
    """
    if not VOSK_AVAILABLE:
        return
    model_dir = self.model_path
    try:
        if not os.path.exists(model_dir):
            print(f"模型路径不存在: {model_dir}")
            return
        self.model = Model(model_dir)
        self.recognizer = KaldiRecognizer(self.model, self.RATE)
        self.recognizer.SetWords(True)
        print("✅ Vosk 模型加载成功")
    except Exception as exc:
        print(f"模型初始化失败: {exc}")
def _setup_audio(self):
"""设置音频设备"""
try:
if self.audio is None:
self.audio = pyaudio.PyAudio()
if self.stream is None:
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK_SIZE
)
print("✅ 音频设备初始化成功")
except Exception as e:
print(f"音频设备初始化失败: {e}")
def _setup_speech_recognition(self, app_key=None, access_key=None):
"""设置语音识别"""
if not self.enable_speech_recognition:
return
try:
self.speech_recognizer = SpeechRecognizer(
app_key=app_key,
access_key=access_key
)
print("✅ 语音识别器初始化成功")
except Exception as e:
print(f"语音识别器初始化失败: {e}")
self.enable_speech_recognition = False
def _calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
import numpy as np
audio_array = np.frombuffer(audio_data, dtype=np.int16)
rms = np.sqrt(np.mean(audio_array ** 2))
return rms
def _check_wake_word(self, text):
"""检查是否包含唤醒词"""
if not text or not self.wake_words:
return False, None
text_lower = text.lower()
for wake_word in self.wake_words:
if wake_word.lower() in text_lower:
return True, wake_word
return False, None
def _save_recording(self, audio_data):
"""保存录音"""
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = f"recording_{timestamp}.wav"
try:
import wave
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(audio_data)
print(f"✅ 录音已保存: {filename}")
return True, filename
except Exception as e:
print(f"保存录音失败: {e}")
return False, None
def _play_audio(self, filename):
"""播放音频文件"""
try:
import wave
# 打开音频文件
with wave.open(filename, 'rb') as wf:
# 获取音频参数
channels = wf.getnchannels()
width = wf.getsampwidth()
rate = wf.getframerate()
total_frames = wf.getnframes()
# 分块读取音频数据,避免内存问题
chunk_size = 1024
frames = []
for _ in range(0, total_frames, chunk_size):
chunk = wf.readframes(chunk_size)
if chunk:
frames.append(chunk)
else:
break
# 创建播放流
playback_stream = self.audio.open(
format=self.audio.get_format_from_width(width),
channels=channels,
rate=rate,
output=True
)
print(f"🔊 开始播放: {filename}")
# 分块播放音频
for chunk in frames:
playback_stream.write(chunk)
# 等待播放完成
playback_stream.stop_stream()
playback_stream.close()
print("✅ 播放完成")
except Exception as e:
print(f"❌ 播放失败: {e}")
self._play_with_system_player(filename)
def _play_with_system_player(self, filename):
    """Play an audio file with the platform's command-line player.

    Uses ``afplay`` on macOS, ``cmd /c start`` on Windows and ``aplay``
    elsewhere. Any failure is printed together with a hint to play the
    file manually; nothing is raised.

    Args:
        filename: Path of the audio file to play.
    """
    try:
        import platform
        import subprocess
        system = platform.system()
        if system == 'Darwin':  # macOS
            cmd = ['afplay', filename]
        elif system == 'Windows':
            # `start` is a cmd.exe builtin, not an executable, so it has to
            # be run through `cmd /c`. The empty string is start's window
            # title argument; without it a quoted filename is taken as the
            # title. NOTE(review): start returns without waiting for the
            # player, so the completion message below can be early here.
            cmd = ['cmd', '/c', 'start', '', '/min', filename]
        else:  # Linux
            cmd = ['aplay', filename]
        print(f"🔊 使用系统播放器: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        print("✅ 播放完成")
    except Exception as e:
        print(f"❌ 系统播放器也失败: {e}")
        print(f"💡 文件已保存,请手动播放: {filename}")
def _start_recognition_thread(self, filename):
"""启动语音识别线程"""
if not self.enable_speech_recognition or not self.speech_recognizer:
return
def recognize_task():
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
print(f"🧠 开始识别录音文件: {filename}")
result = loop.run_until_complete(
self.speech_recognizer.recognize_file(filename)
)
if result:
# 合并所有识别结果
full_text = " ".join([r.text for r in result])
final_result = RecognitionResult(
text=full_text,
confidence=0.9,
is_final=True
)
self.last_recognition_result = final_result
print(f"\n🧠 语音识别结果: {full_text}")
# 调用回调函数
if self.on_recognition_result:
self.on_recognition_result(final_result)
else:
print(f"\n🧠 语音识别失败或未识别到内容")
loop.close()
except Exception as e:
print(f"❌ 语音识别线程异常: {e}")
self.recognition_thread = threading.Thread(target=recognize_task)
self.recognition_thread.daemon = True
self.recognition_thread.start()
def _start_recording(self):
"""开始录音"""
print("🎙️ 开始录音,请说话...")
self.recording = True
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = time.time()
# 为录音创建一个新的识别器
if self.model:
self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
self.recording_recognizer.SetWords(True)
def _stop_recording(self):
"""停止录音"""
if len(self.recorded_frames) > 0:
audio_data = b''.join(self.recorded_frames)
duration = len(audio_data) / (self.RATE * 2)
print(f"📝 录音完成,时长: {duration:.2f}")
# 保存录音
success, filename = self._save_recording(audio_data)
# 如果保存成功,播放录音并进行语音识别
if success and filename:
print("=" * 50)
print("🔊 播放刚才录制的音频...")
self._play_audio(filename)
print("=" * 50)
# 启动语音识别
if self.enable_speech_recognition:
print("🧠 准备进行语音识别...")
self._start_recognition_thread(filename)
self.recording = False
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = None
self.recording_recognizer = None
def set_recognition_callback(self, callback):
    """Register the callable that receives each final recognition result."""
    self.on_recognition_result = callback
def get_last_recognition_result(self) -> Optional[RecognitionResult]:
    """Return the most recently stored recognition result (``None`` if none yet)."""
    return self.last_recognition_result
def start(self):
"""Run the blocking wake-word detection and recording loop.

Reads CHUNK_SIZE frames at a time from the input stream while
self.running is true. In recording mode each chunk is stored and fed to
the recording recognizer; otherwise chunks go to the wake-word
recognizer. KeyboardInterrupt and other exceptions end the loop, and
stop() is always called on exit.
"""
if not self.stream:
print("❌ 音频设备未初始化")
return
self.running = True
print("🎤 开始监听...")
print(f"唤醒词: {', '.join(self.wake_words)}")
if self.enable_speech_recognition:
print("🧠 语音识别: 已启用")
else:
print("🧠 语音识别: 已禁用")
try:
while self.running:
# Read one chunk of audio
data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
if len(data) == 0:
continue
if self.recording:
# Recording mode: keep the chunk and track elapsed time
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
# Live recognition with the recognizer dedicated to this recording
if self.recording_recognizer:
if self.recording_recognizer.AcceptWaveform(data):
result = json.loads(self.recording_recognizer.Result())
text = result.get('text', '').strip()
if text:
# Any recognised text counts as speech activity
self.last_text_time = time.time()
print(f"\n📝 实时识别: {text}")
else:
partial_result = json.loads(self.recording_recognizer.PartialResult())
partial_text = partial_result.get('partial', '').strip()
if partial_text:
self.last_text_time = time.time()
status = f"录音中... {recording_duration:.1f}s | {partial_text}"
print(f"\r{status}", end='', flush=True)
# Decide whether the recording should end
current_time = time.time()
if self.last_text_time is not None:
# Text was seen before: stop after text_silence_threshold seconds
# without new text, once min_recording_time has elapsed
text_silence_duration = current_time - self.last_text_time
if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
print(f"\n\n3秒没有识别到文字结束录音")
self._stop_recording()
else:
# No text recognised yet: give up after a fixed 5 seconds
if recording_duration > 5.0:
print(f"\n\n5秒没有识别到文字结束录音")
self._stop_recording()
# Enforce the maximum recording time
if recording_duration > self.max_recording_time:
print(f"\n\n达到最大录音时间 {self.max_recording_time}s")
self._stop_recording()
# Show the waiting status while no text has been recognised
if self.last_text_time is None:
status = f"等待语音输入... {recording_duration:.1f}s"
print(f"\r{status}", end='', flush=True)
elif self.model and self.recognizer:
# Wake-word detection mode
if self.recognizer.AcceptWaveform(data):
result = json.loads(self.recognizer.Result())
text = result.get('text', '').strip()
if text:
print(f"识别: {text}")
# Check the recognised text for a wake word
is_wake_word, detected_word = self._check_wake_word(text)
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
else:
# Show the live audio level (only above an energy of 50)
energy = self._calculate_energy(data)
if energy > 50:
partial_result = json.loads(self.recognizer.PartialResult())
partial_text = partial_result.get('partial', '')
if partial_text:
status = f"监听中... 能量: {energy:.0f} | {partial_text}"
else:
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
time.sleep(0.01)
except KeyboardInterrupt:
print("\n👋 退出")
except Exception as e:
print(f"错误: {e}")
finally:
self.stop()
def stop(self):
    """Stop the loop and release every audio resource.

    An in-progress recording is finished first (it still needs the audio
    device for playback). The stream and the PyAudio instance are then
    released even if an earlier step raises; the first exception still
    propagates to the caller. Finally waits up to 5 seconds for a running
    recognition thread. Safe to call more than once.
    """
    self.running = False
    try:
        if self.recording:
            self._stop_recording()
    finally:
        # Detach the handles first so a failure below cannot leave stale
        # references behind for a later stop() call.
        stream, self.stream = self.stream, None
        audio, self.audio = self.audio, None
        try:
            if stream:
                try:
                    stream.stop_stream()
                finally:
                    stream.close()
        finally:
            if audio:
                audio.terminate()
            # Wait for the recognition thread to finish
            if self.recognition_thread and self.recognition_thread.is_alive():
                self.recognition_thread.join(timeout=5.0)
def main():
    """Command-line entry point for the wake-word + recognition demo.

    Requires a ``model`` directory in the working directory; returns early
    with a hint when it is missing or the model fails to load.
    """
    banner = "=" * 50
    print("🚀 增强版唤醒+录音+语音识别测试")
    print(banner)
    # Check that the model directory exists
    model_dir = "model"
    if not os.path.exists(model_dir):
        for line in ("⚠️ 未找到模型目录", "请下载 Vosk 模型到 model 目录"):
            print(line)
        return
    # Build the system
    system = EnhancedWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"],
        enable_speech_recognition=True,
        # app_key="your_app_key",  # fill in the real app_key
        # access_key="your_access_key"  # fill in the real access_key
    )
    if not system.model:
        print("❌ 模型加载失败")
        return

    # Callback that reports each final recognition result
    def report_result(result):
        print(f"\n🎯 识别完成!结果: {result.text}")
        print(f" 置信度: {result.confidence}")
        print(f" 是否最终结果: {result.is_final}")

    system.set_recognition_callback(report_result)
    usage = (
        "✅ 系统初始化成功",
        "📖 使用说明:",
        "1. 说唤醒词开始录音",
        "2. 基于语音识别判断3秒没有识别到文字就结束",
        "3. 最少录音2秒最多30秒",
        "4. 录音时实时显示识别结果",
        "5. 录音文件自动保存",
        "6. 录音完成后自动播放刚才录制的内容",
        "7. 启动语音识别对录音文件进行识别",
        "8. 按 Ctrl+C 退出",
        banner,
    )
    for line in usage:
        print(line)
    # Run until interrupted
    system.start()


if __name__ == "__main__":
    main()

View File

@ -39,7 +39,8 @@ class EnergyBasedRecorder:
self.recorded_frames = []
self.recording_start_time = None
self.last_sound_time = None
self.energy_history = [] # 能量历史
self.energy_history = []
self.zcr_history = [] # ZCR历史
self.max_energy_history = 50 # 最大能量历史记录
# 预录音缓冲区
@ -49,6 +50,17 @@ class EnergyBasedRecorder:
# 播放状态
self.is_playing = False # 是否正在播放
# 智能静音检测
self.voice_activity_history = [] # 语音活动历史
self.max_voice_history = 20 # 最大语音活动历史记录
self.consecutive_silence_count = 0 # 连续静音计数
self.silence_threshold_count = 15 # 连续静音次数阈值约1.5秒)
# 智能ZCR静音检测
self.max_zcr_history = 30 # 最大ZCR历史记录
self.consecutive_low_zcr_count = 0 # 连续低ZCR计数
self.low_zcr_threshold_count = 20 # 连续低ZCR次数阈值约2秒
# 性能监控
self.frame_count = 0
self.start_time = time.time()
@ -100,7 +112,7 @@ class EnergyBasedRecorder:
return peak_energy
def calculate_zero_crossing_rate(self, audio_data):
"""计算零交叉率(辅助判断语音"""
"""计算零交叉率(主要语音检测方法"""
if len(audio_data) == 0:
return 0
@ -112,40 +124,27 @@ class EnergyBasedRecorder:
# 归一化到采样率
zcr = zero_crossings / len(audio_array) * self.RATE
# 更新ZCR历史
self.zcr_history.append(zcr)
if len(self.zcr_history) > self.max_zcr_history:
self.zcr_history.pop(0)
return zcr
def is_voice_active_advanced(self, energy, zcr):
"""高级语音活动检测"""
# 动态阈值:基于背景噪音
if len(self.energy_history) >= 10:
# 使用最近10个样本的中位数作为背景噪音
background_energy = np.median(self.energy_history[-10:])
# 动态阈值:背景噪音 + 50%(提高敏感性)
dynamic_threshold = max(50, background_energy * 1.5)
# 能量条件
energy_condition = energy > dynamic_threshold
# 零交叉率条件语音通常在1000-5000 Hz之间
# 对于8kHz采样率ZCR通常在1000-4000之间
zcr_condition = 1000 < zcr < 4000
# 同时满足能量和ZCR条件才认为是语音
return energy_condition and zcr_condition
else:
# 初始阶段使用固定阈值
return energy > 60 and zcr > 1000 # 更严格的初始条件
def get_average_energy(self):
"""获取平均能量水平"""
if not self.energy_history:
return 0
return np.mean(self.energy_history)
"""仅使用ZCR进行语音活动检测"""
# ZCR语音检测提高到1200-6000 Hz之间更好地区分语音和环境噪音
# 说话时ZCR会比较稳定在这个范围内
zcr_condition = 1200 < zcr < 6000
# 添加一些容错避免短暂的ZCR波动导致误判
return zcr_condition
def is_voice_active(self, energy):
"""判断是否有人声"""
return energy > self.energy_threshold
"""已弃用 - 仅用于兼容性"""
# 现在主要使用ZCR检测这个方法保留但不再使用
return False
def save_recording(self, audio_data, filename=None):
"""保存录音"""
@ -181,6 +180,7 @@ class EnergyBasedRecorder:
# 清空所有缓冲区
self.pre_record_buffer = []
self.energy_history = []
self.zcr_history = []
# 完全关闭输入流
if self.stream:
@ -247,6 +247,7 @@ class EnergyBasedRecorder:
# 重置所有状态
self.energy_history = []
self.zcr_history = []
print("📡 音频输入已重新开启")
def play_with_system_player(self, filename):
@ -277,6 +278,7 @@ class EnergyBasedRecorder:
# 清空所有缓冲区
self.pre_record_buffer = []
self.energy_history = []
self.zcr_history = []
# 完全关闭输入流
if self.stream:
@ -312,6 +314,7 @@ class EnergyBasedRecorder:
# 重置所有状态
self.energy_history = []
self.zcr_history = []
print("📡 音频输入已重新开启")
def update_pre_record_buffer(self, audio_data):
@ -336,7 +339,13 @@ class EnergyBasedRecorder:
self.recording_start_time = time.time()
self.last_sound_time = time.time()
self.energy_history = [] # 重置能量历史
self.energy_history = []
self.zcr_history = [] # 重置ZCR历史
# 重置ZCR相关计数器
self.consecutive_low_zcr_count = 0
self.consecutive_silence_count = 0
self.voice_activity_history = []
def stop_recording(self):
"""停止录音"""
@ -366,6 +375,7 @@ class EnergyBasedRecorder:
self.recording_start_time = None
self.last_sound_time = None
self.energy_history = []
self.zcr_history = []
def monitor_performance(self):
"""性能监控"""
@ -397,7 +407,7 @@ class EnergyBasedRecorder:
self.running = True
print("🎤 开始监听...")
print(f"能量阈值: {self.energy_threshold}")
print(f"能量阈值: {self.energy_threshold} (已弃用)")
print(f"静音阈值: {self.silence_threshold}")
print("📖 使用说明:")
print("- 检测到声音自动开始录音")
@ -406,11 +416,16 @@ class EnergyBasedRecorder:
print("- 录音完成后自动播放")
print("- 按 Ctrl+C 退出")
print("🎯 新增功能:")
print("- 动态阈值调整(基于背景噪音")
print("- 纯ZCR语音检测移除能量检测")
print("- 零交叉率检测(区分语音和噪音)")
print("- 实时显示ZCR和背景能量")
print("- 实时显示ZCR状态")
print("- 预录音功能包含声音开始前2秒")
print("- 环形缓冲区防止丢失开头音频")
print("🤖 纯ZCR静音检测:")
print("- 连续低ZCR计数20次=2秒")
print("- ZCR活动历史追踪")
print("- 基于ZCR模式的静音验证")
print("- 语音范围: 1200-6000 Hz (提高阈值)")
print("=" * 50)
try:
@ -442,28 +457,60 @@ class EnergyBasedRecorder:
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
# 更新最后声音时间
# 基于ZCR的智能静音检测
if self.is_voice_active_advanced(energy, zcr):
self.last_sound_time = time.time()
self.consecutive_low_zcr_count = 0 # 重置低ZCR计数
self.consecutive_silence_count = 0 # 重置静音计数
else:
self.consecutive_low_zcr_count += 1 # 增加低ZCR计数
self.consecutive_silence_count += 1 # 增加静音计数
# 更新ZCR活动历史基于ZCR是否在语音范围内
self.voice_activity_history.append(1200 < zcr < 6000)
if len(self.voice_activity_history) > self.max_voice_history:
self.voice_activity_history.pop(0)
# 检查是否应该结束录音
current_time = time.time()
# 检查静音超时
if current_time - self.last_sound_time > self.silence_threshold:
if recording_duration >= self.min_recording_time:
print(f"\n🔇 检测到持续静音 {self.silence_threshold}秒,结束录音")
self.stop_recording()
# 纯ZCR静音检测
should_stop = False
stop_reason = ""
# 主要检测连续低ZCR计数
if self.consecutive_low_zcr_count >= self.low_zcr_threshold_count:
# 进一步验证检查最近的ZCR活动历史
if len(self.voice_activity_history) >= 15:
recent_voice_activity = sum(self.voice_activity_history[-15:])
if recent_voice_activity <= 3: # 最近15个样本中最多3个有语音活动
should_stop = True
stop_reason = f"ZCR静音检测 ({self.consecutive_low_zcr_count}次连续低ZCR)"
else:
# 如果历史数据不足,使用基础检测
should_stop = True
stop_reason = f"基础ZCR静音检测 ({self.consecutive_low_zcr_count}次)"
# 备用检测:基于时间的静音检测
if not should_stop and current_time - self.last_sound_time > self.silence_threshold:
should_stop = True
stop_reason = f"时间静音检测 ({self.silence_threshold}秒)"
# 执行停止录音
if should_stop and recording_duration >= self.min_recording_time:
print(f"\n🔇 {stop_reason},结束录音")
self.stop_recording()
# 检查最大录音时间
if recording_duration > self.max_recording_time:
print(f"\n⏰ 达到最大录音时间 {self.max_recording_time}")
self.stop_recording()
# 显示录音状态(包含调试信息)
bg_energy = np.median(self.energy_history[-10:]) if len(self.energy_history) >= 10 else 0
# 显示录音状态仅ZCR相关信息
is_voice = self.is_voice_active_advanced(energy, zcr)
status = f"录音中... {recording_duration:.1f}s | RMS: {energy:.0f} | 峰值: {peak_energy:.0f} | ZCR: {zcr:.0f} | 语音: {is_voice}"
zcr_progress = f"{self.consecutive_low_zcr_count}/{self.low_zcr_threshold_count}"
recent_activity = sum(self.voice_activity_history[-5:]) if len(self.voice_activity_history) >= 5 else 0
status = f"录音中... {recording_duration:.1f}s | ZCR: {zcr:.0f} | 语音: {is_voice} | 低ZCR计数: {zcr_progress} | 活动: {recent_activity}"
print(f"\r{status}", end='', flush=True)
else:
@ -475,12 +522,10 @@ class EnergyBasedRecorder:
# 检测到声音,开始录音
self.start_recording()
else:
# 显示监听状态(包含调试信息)
avg_energy = self.get_average_energy()
bg_energy = np.median(self.energy_history[-10:]) if len(self.energy_history) >= 10 else 0
buffer_usage = len(self.pre_record_buffer) / self.pre_record_max_frames * 100
# 显示监听状态仅ZCR相关信息
is_voice = self.is_voice_active_advanced(energy, zcr)
status = f"监听中... RMS: {energy:.0f} | 峰值: {peak_energy:.0f} | ZCR: {zcr:.0f} | 背景: {bg_energy:.0f} | 语音: {is_voice} | 缓冲: {buffer_usage:.0f}%"
buffer_usage = len(self.pre_record_buffer) / self.pre_record_max_frames * 100
status = f"监听中... ZCR: {zcr:.0f} | 语音: {is_voice} | 缓冲: {buffer_usage:.0f}%"
print(f"\r{status}", end='', flush=True)
# 减少CPU使用

View File

@ -1,580 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simplified wake-word + recording test.
Focused on resolving audio conflicts.
"""
import sys
import os
import time
import threading
import pyaudio
import json
# 添加当前目录到路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from vosk import Model, KaldiRecognizer
VOSK_AVAILABLE = True
except ImportError:
VOSK_AVAILABLE = False
print("⚠️ Vosk 未安装,请运行: pip install vosk")
class SimpleWakeAndRecord:
"""简化的唤醒+录音系统"""
def __init__(self, model_path="model", wake_words=None):
    """Create the simplified wake-word + recording system.

    Args:
        model_path: Path of the Vosk model directory.
        wake_words: Phrases that switch the loop into recording mode.
            ``None`` (the default) selects ``["你好", "助手"]``.
    """
    self.model_path = model_path
    # Build the default per instance: a list literal in the signature is a
    # single object shared (and mutable) across every instance.
    self.wake_words = ["你好", "助手"] if wake_words is None else wake_words
    self.model = None
    self.recognizer = None
    self.audio = None
    self.stream = None
    self.running = False
    # Audio parameters - aggressively tuned for the Raspberry Pi 3B
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    self.RATE = 8000  # down from 16 kHz to halve the data volume
    self.CHUNK_SIZE = 4096  # larger chunks mean far fewer processing passes
    self.AGGRESSIVE_MODE = True  # aggressive optimisation mode
    # Recording state
    self.recording = False
    self.recorded_frames = []
    self.last_text_time = None  # time text was last recognised
    self.recording_start_time = None
    self.recording_recognizer = None  # recognizer dedicated to recording
    # Batch-processing settings
    self.audio_buffer = []  # pending audio chunks
    self.buffer_size = 5  # maximum number of buffered chunks
    self.last_process_time = time.time()  # time of the last batch
    self.process_interval = 0.2  # minimum seconds between batches
    self.batch_process_size = 3  # chunks consumed per batch
    self.skip_partial_results = True  # only final results are used
    # Performance monitoring
    self.process_count = 0
    self.avg_process_time = 0
    self.last_monitor_time = time.time()
    self.monitor_interval = 3.0  # seconds between monitor reports
    # Latency monitoring
    self.audio_receive_times = []  # timestamps of received chunks
    self.process_start_times = []  # timestamps of batch starts
    self.latency_samples = []  # recent latency measurements
    self.max_latency_samples = 10  # number of latency samples kept
    # Thresholds, in seconds
    self.text_silence_threshold = 3.0  # stop after this long without text
    self.min_recording_time = 2.0  # minimum recording length
    self.max_recording_time = 30.0  # maximum recording length
    self._setup_model()
    self._setup_audio()
def _setup_model(self):
    """Load the Vosk model, create the wake-word recognizer and time the load.

    Does nothing when Vosk is unavailable. A missing model directory or any
    loading error is reported on stdout and leaves the model unset.
    """
    if not VOSK_AVAILABLE:
        return
    model_dir = self.model_path
    try:
        if not os.path.exists(model_dir):
            print(f"模型路径不存在: {model_dir}")
            return
        print("🔄 正在加载模型,这可能需要一些时间...")
        began = time.time()
        self.model = Model(model_dir)
        self.recognizer = KaldiRecognizer(self.model, self.RATE)
        # Aggressive mode turns word-level results off
        if self.AGGRESSIVE_MODE:
            self.recognizer.SetWords(False)
            print("📉 激进模式:已禁用词级识别以提高性能")
        else:
            self.recognizer.SetWords(True)
        load_time = time.time() - began
        print(f"✅ Vosk 模型加载成功 (耗时: {load_time:.2f}s)")
    except Exception as exc:
        print(f"模型初始化失败: {exc}")
def _setup_audio(self):
"""设置音频设备"""
try:
if self.audio is None:
self.audio = pyaudio.PyAudio()
if self.stream is None:
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK_SIZE
)
print("✅ 音频设备初始化成功")
except Exception as e:
print(f"音频设备初始化失败: {e}")
def _calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
import numpy as np
audio_array = np.frombuffer(audio_data, dtype=np.int16)
rms = np.sqrt(np.mean(audio_array ** 2))
return rms
def _check_wake_word(self, text):
"""检查是否包含唤醒词"""
if not text or not self.wake_words:
return False, None
text_lower = text.lower()
for wake_word in self.wake_words:
if wake_word.lower() in text_lower:
return True, wake_word
return False, None
def _should_process_audio(self):
"""判断是否应该处理音频"""
current_time = time.time()
return (current_time - self.last_process_time >= self.process_interval and
len(self.audio_buffer) >= self.batch_process_size)
def _process_audio_batch(self):
"""批量处理音频数据"""
if len(self.audio_buffer) < self.batch_process_size:
return
# 记录处理开始时间
start_time = time.time()
self.process_start_times.append(start_time)
# 取出批处理数据
batch_data = self.audio_buffer[:self.batch_process_size]
self.audio_buffer = self.audio_buffer[self.batch_process_size:]
# 合并音频数据
combined_data = b''.join(batch_data)
# 更新处理时间
self.last_process_time = time.time()
# 更新性能统计
process_time = time.time() - start_time
self.process_count += 1
self.avg_process_time = (self.avg_process_time * (self.process_count - 1) + process_time) / self.process_count
# 性能监控
self._monitor_performance()
return combined_data
def _monitor_performance(self):
"""性能监控"""
current_time = time.time()
if current_time - self.last_monitor_time >= self.monitor_interval:
buffer_usage = len(self.audio_buffer) / self.buffer_size * 100
# 计算平均延迟
avg_latency = 0
if self.latency_samples:
avg_latency = sum(self.latency_samples) / len(self.latency_samples)
print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}% | 平均延迟: {avg_latency:.2f}s")
self.last_monitor_time = current_time
def _calculate_latency(self, audio_time):
"""计算音频延迟"""
current_time = time.time()
latency = current_time - audio_time
# 添加到延迟样本
self.latency_samples.append(latency)
if len(self.latency_samples) > self.max_latency_samples:
self.latency_samples.pop(0)
return latency
def _lightweight_recognition(self, recognizer, audio_data):
"""轻量级识别处理"""
if not recognizer:
return None
# 激进模式:跳过部分识别结果,只处理最终结果
if self.skip_partial_results:
if recognizer.AcceptWaveform(audio_data):
result = json.loads(recognizer.Result())
return result.get('text', '').strip()
else:
# 标准模式:处理部分和最终结果
if recognizer.AcceptWaveform(audio_data):
result = json.loads(recognizer.Result())
return result.get('text', '').strip()
else:
partial_result = json.loads(recognizer.PartialResult())
return partial_result.get('partial', '').strip()
return None
def _save_recording(self, audio_data):
"""保存录音"""
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = f"recording_{timestamp}.wav"
try:
import wave
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(audio_data)
print(f"✅ 录音已保存: {filename}")
return True, filename
except Exception as e:
print(f"保存录音失败: {e}")
return False, None
def _play_audio(self, filename):
"""播放音频文件"""
try:
import wave
# 打开音频文件
with wave.open(filename, 'rb') as wf:
# 获取音频参数
channels = wf.getnchannels()
width = wf.getsampwidth()
rate = wf.getframerate()
total_frames = wf.getnframes()
# 分块读取音频数据,避免内存问题
chunk_size = 1024
frames = []
for _ in range(0, total_frames, chunk_size):
chunk = wf.readframes(chunk_size)
if chunk:
frames.append(chunk)
else:
break
# 创建播放流
playback_stream = self.audio.open(
format=self.audio.get_format_from_width(width),
channels=channels,
rate=rate,
output=True
)
print(f"🔊 开始播放: {filename}")
# 分块播放音频
for chunk in frames:
playback_stream.write(chunk)
# 等待播放完成
playback_stream.stop_stream()
playback_stream.close()
print("✅ 播放完成")
except Exception as e:
print(f"❌ 播放失败: {e}")
# 如果pyaudio播放失败尝试用系统命令播放
self._play_with_system_player(filename)
def _play_with_system_player(self, filename):
    """Play an audio file with the platform's command-line player.

    Uses ``afplay`` on macOS, ``cmd /c start`` on Windows and ``aplay``
    elsewhere. Any failure is printed together with a hint to play the
    file manually; nothing is raised.

    Args:
        filename: Path of the audio file to play.
    """
    try:
        import platform
        import subprocess
        system = platform.system()
        if system == 'Darwin':  # macOS
            cmd = ['afplay', filename]
        elif system == 'Windows':
            # `start` is a cmd.exe builtin, not an executable, so it has to
            # be run through `cmd /c`. The empty string is start's window
            # title argument; without it a quoted filename is taken as the
            # title. NOTE(review): start returns without waiting for the
            # player, so the completion message below can be early here.
            cmd = ['cmd', '/c', 'start', '', '/min', filename]
        else:  # Linux
            cmd = ['aplay', filename]
        print(f"🔊 使用系统播放器: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        print("✅ 播放完成")
    except Exception as e:
        print(f"❌ 系统播放器也失败: {e}")
        print(f"💡 文件已保存,请手动播放: {filename}")
def _start_recording(self):
"""开始录音"""
print("🎙️ 开始录音,请说话...")
self.recording = True
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = time.time()
# 为录音创建一个新的识别器
if self.model:
self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
# 激进模式:禁用词级识别以提高性能
if self.AGGRESSIVE_MODE:
self.recording_recognizer.SetWords(False)
else:
self.recording_recognizer.SetWords(True)
def _stop_recording(self):
"""停止录音"""
if len(self.recorded_frames) > 0:
audio_data = b''.join(self.recorded_frames)
duration = len(audio_data) / (self.RATE * 2)
print(f"📝 录音完成,时长: {duration:.2f}")
# 保存录音
success, filename = self._save_recording(audio_data)
# 如果保存成功,播放录音
if success and filename:
print("=" * 50)
print("🔊 播放刚才录制的音频...")
self._play_audio(filename)
print("=" * 50)
self.recording = False
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = None
self.recording_recognizer = None
def start(self):
"""Run the blocking wake-word detection and recording loop.

Reads CHUNK_SIZE frames at a time from the input stream while
self.running is true and timestamps every chunk. In recording mode the
chunk is stored and recognised, either directly (AGGRESSIVE_MODE) or in
batches; otherwise it is checked for a wake word. KeyboardInterrupt and
other exceptions end the loop, and stop() is always called on exit.
"""
if not self.stream:
print("❌ 音频设备未初始化")
return
self.running = True
print("🎤 开始监听...")
print(f"唤醒词: {', '.join(self.wake_words)}")
try:
while self.running:
# Read one chunk of audio
receive_time = time.time()
data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
if len(data) == 0:
continue
# Remember when the chunk was requested (bounded history)
self.audio_receive_times.append(receive_time)
if len(self.audio_receive_times) > self.max_latency_samples:
self.audio_receive_times.pop(0)
if self.recording:
# Recording mode: keep the chunk and track elapsed time
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
# The chunk is also queued for batch recognition
self.audio_buffer.append(data)
# Cap the buffer by dropping the oldest chunk
if len(self.audio_buffer) > self.buffer_size:
self.audio_buffer.pop(0)
# Aggressive mode: recognise immediately instead of waiting for a batch
if self.AGGRESSIVE_MODE and self.recording_recognizer:
# Recognise the current chunk directly
text = self._lightweight_recognition(self.recording_recognizer, data)
if text:
# Measure latency against the oldest stored receive time
if self.audio_receive_times:
latency = self._calculate_latency(self.audio_receive_times[0])
self.audio_receive_times.pop(0)
# Text was recognised: refresh the activity timestamp
self.last_text_time = time.time()
# NOTE(review): latency is only bound by the branch above
print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)")
else:
# Standard batch mode
if self._should_process_audio() and self.recording_recognizer:
combined_data = self._process_audio_batch()
if combined_data:
text = self._lightweight_recognition(self.recording_recognizer, combined_data)
if text:
# Measure latency for this batch
if self.process_start_times:
process_start = self.process_start_times[0]
self.process_start_times.pop(0)
if self.audio_receive_times:
audio_time = self.audio_receive_times[0]
self.audio_receive_times.pop(0)
# NOTE(review): this local is not read again in this branch
latency = process_start - audio_time
self._calculate_latency(audio_time)
self.last_text_time = time.time()
print(f"\n📝 识别: {text}")
# Decide whether the recording should end
current_time = time.time()
# Aggressive mode uses a shorter no-text timeout
timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0
if self.last_text_time is not None:
# Text was seen before: stop after text_silence_threshold seconds
# without new text, once min_recording_time has elapsed
text_silence_duration = current_time - self.last_text_time
if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
print(f"\n\n3秒没有识别到文字结束录音")
self._stop_recording()
else:
# No text recognised yet: check the no-text timeout
if recording_duration > timeout_duration:
print(f"\n\n{timeout_duration}秒没有识别到文字,结束录音")
self._stop_recording()
# Enforce the maximum recording time
if recording_duration > self.max_recording_time:
print(f"\n\n达到最大录音时间 {self.max_recording_time}s")
self._stop_recording()
# Show the waiting status while no text has been recognised
if self.last_text_time is None:
status = f"等待语音输入... {recording_duration:.1f}s"
print(f"\r{status}", end='', flush=True)
elif self.model and self.recognizer:
# Wake-word detection mode
if self.AGGRESSIVE_MODE:
# Recognise directly, without the buffer
text = self._lightweight_recognition(self.recognizer, data)
if text:
print(f"识别: {text}")
# Check the recognised text for a wake word
is_wake_word, detected_word = self._check_wake_word(text)
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
# Show the live audio level (only at high energy)
energy = self._calculate_energy(data)
if energy > 100: # higher threshold so the status is shown less often
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
else:
# Standard batch mode
self.audio_buffer.append(data)
# Cap the buffer by dropping the oldest chunk
if len(self.audio_buffer) > self.buffer_size:
self.audio_buffer.pop(0)
# Recognise a batch when one is due
if self._should_process_audio():
combined_data = self._process_audio_batch()
if combined_data:
text = self._lightweight_recognition(self.recognizer, combined_data)
if text:
print(f"识别: {text}")
# Check the recognised text for a wake word
is_wake_word, detected_word = self._check_wake_word(text)
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
# Show the live audio level
energy = self._calculate_energy(data)
if energy > 50:
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
# Aggressive mode sleeps longer to reduce CPU use
sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
time.sleep(sleep_time)
except KeyboardInterrupt:
print("\n👋 退出")
except Exception as e:
print(f"错误: {e}")
finally:
self.stop()
def stop(self):
    """Stop the loop and release every audio resource.

    An in-progress recording is finished first (it still needs the audio
    device for playback). The stream and the PyAudio instance are then
    released even if an earlier step raises; the first exception still
    propagates to the caller. Safe to call more than once.
    """
    self.running = False
    try:
        if self.recording:
            self._stop_recording()
    finally:
        # Detach the handles first so a failure below cannot leave stale
        # references behind for a later stop() call.
        stream, self.stream = self.stream, None
        audio, self.audio = self.audio, None
        try:
            if stream:
                try:
                    stream.stop_stream()
                finally:
                    stream.close()
        finally:
            if audio:
                audio.terminate()
def main():
    """Command-line entry point for the simplified wake-word + recording test.

    Requires a ``model`` directory in the working directory; returns early
    with a hint when it is missing or the model fails to load.
    """
    banner = "=" * 50
    print("🚀 简化唤醒+录音测试")
    print(banner)
    # Check that the model directory exists
    model_dir = "model"
    if not os.path.exists(model_dir):
        for line in ("⚠️ 未找到模型目录", "请下载 Vosk 模型到 model 目录"):
            print(line)
        return
    # Build the system
    system = SimpleWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"]
    )
    if not system.model:
        print("❌ 模型加载失败")
        return
    usage = (
        "✅ 系统初始化成功",
        "📖 使用说明:",
        "1. 说唤醒词开始录音",
        "2. 基于语音识别判断3秒没有识别到文字就结束",
        "3. 最少录音2秒最多30秒",
        "4. 录音时实时显示识别结果",
        "5. 录音文件自动保存",
        "6. 录音完成后自动播放刚才录制的内容",
        "7. 按 Ctrl+C 退出",
        "🚀 激进性能优化已启用:",
        " - 采样率: 8kHz (降低50%数据量)",
        " - 块大小: 4096字节 (4倍于原始大小)",
        " - 激进模式: 已启用 (直接处理,跳过部分结果)",
        " - 批处理: 3个音频块/次",
        " - 处理间隔: 0.2秒",
        " - 缓冲区: 5个音频块",
        " - 词级识别: 已禁用 (提高性能)",
        " - 性能监控: 每3秒显示",
        " - 延迟监控: 实时显示",
        " - 预期延迟: <1秒 (原10秒)",
        banner,
    )
    for line in usage:
        print(line)
    # Run until interrupted
    system.start()


if __name__ == "__main__":
    main()

View File

@ -1,344 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Voice recording module.
Built on pyaudio; uses voice activity detection (VAD) to decide automatically when a recording ends.
"""
import pyaudio
import wave
import numpy as np
import time
import os
import threading
from collections import deque
class VoiceRecorder:
"""语音录制器,支持自动检测语音结束"""
def __init__(self,
             energy_threshold=500,
             silence_threshold=1.0,
             min_recording_time=0.5,
             max_recording_time=10.0,
             sample_rate=16000,
             chunk_size=1024,
             defer_audio_init=False):
    """Configure the recorder and, unless deferred, set up the audio device.

    Args:
        energy_threshold: Energy above which a chunk counts as speech.
        silence_threshold: Silence duration threshold.
        min_recording_time: Minimum recording time.
        max_recording_time: Maximum recording time.
        sample_rate: Sampling rate used for the input stream.
        chunk_size: Number of frames read per chunk.
        defer_audio_init: When true, the audio device is not set up here.
    """
    # Fixed audio format
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    # Caller-supplied configuration
    self.energy_threshold = energy_threshold
    self.silence_threshold = silence_threshold
    self.min_recording_time = min_recording_time
    self.max_recording_time = max_recording_time
    self.sample_rate = sample_rate
    self.chunk_size = chunk_size
    self.defer_audio_init = defer_audio_init
    # Device and recording state
    self.audio = None
    self.stream = None
    self.recording = False
    self.recorded_frames = []
    # Speech detection state
    self.silence_start_time = None
    self.recording_start_time = None
    # Ring buffer holding about two seconds of chunks
    chunks_per_second = sample_rate / chunk_size
    self.audio_buffer = deque(maxlen=int(chunks_per_second * 2))
    # Optional callbacks
    self.on_recording_complete = None
    self.on_speech_detected = None
    if not defer_audio_init:
        self._setup_audio()
def _setup_audio(self):
"""设置音频设备"""
try:
self.audio = pyaudio.PyAudio()
# 获取默认输入设备信息
device_info = self.audio.get_default_input_device_info()
print(f"使用音频设备: {device_info['name']}")
except Exception as e:
print(f"音频设备初始化失败: {e}")
raise
def _calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
# 转换为numpy数组
audio_array = np.frombuffer(audio_data, dtype=np.int16)
# 计算RMS能量
rms = np.sqrt(np.mean(audio_array ** 2))
return rms
def _is_speech(self, audio_data):
"""判断是否为语音"""
energy = self._calculate_energy(audio_data)
return energy > self.energy_threshold
def _open_stream(self):
"""打开音频流"""
if self.stream is not None:
return
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk_size
)
def _close_stream(self):
"""关闭音频流"""
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
def start_listening(self):
"""开始监听语音"""
if self.recording:
print("正在录音中...")
return
self._open_stream()
self.recording = True
self.recorded_frames = []
self.silence_start_time = None
self.recording_start_time = None
print("开始监听语音...")
# 在新线程中录音
recording_thread = threading.Thread(target=self._record_loop)
recording_thread.daemon = True
recording_thread.start()
def _record_loop(self):
"""录音循环"""
try:
while self.recording:
# 读取音频数据
data = self.stream.read(self.chunk_size, exception_on_overflow=False)
if len(data) == 0:
continue
# 计算能量
energy = self._calculate_energy(data)
# 添加到缓冲区
self.audio_buffer.append(data)
# 检测语音活动
if energy > self.energy_threshold:
# 检测到语音
if self.recording_start_time is None:
# 开始录音
self.recording_start_time = time.time()
self.silence_start_time = None
self.recorded_frames = list(self.audio_buffer) # 包含之前的音频
print("🎤 检测到语音,开始录音...")
if self.on_speech_detected:
self.on_speech_detected()
# 重置静音计时
self.silence_start_time = None
# 录音
self.recorded_frames.append(data)
elif self.recording_start_time is not None:
# 之前有语音,现在检查是否静音
if self.silence_start_time is None:
self.silence_start_time = time.time()
# 继续录音
self.recorded_frames.append(data)
# 检查是否静音超时
silence_duration = time.time() - self.silence_start_time
if silence_duration > self.silence_threshold:
recording_duration = time.time() - self.recording_start_time
# 检查最小录音时间
if recording_duration >= self.min_recording_time:
print(f"静音 {silence_duration:.1f}s结束录音")
self.stop_recording()
break
else:
print(f"录音时间太短 ({recording_duration:.1f}s),继续等待...")
self.silence_start_time = time.time()
# 检查最大录音时间
if self.recording_start_time is not None:
recording_duration = time.time() - self.recording_start_time
if recording_duration > self.max_recording_time:
print(f"达到最大录音时间 {self.max_recording_time}s结束录音")
self.stop_recording()
break
# 短暂休眠
time.sleep(0.01)
except Exception as e:
print(f"录音过程中发生错误: {e}")
self.stop_recording()
def stop_recording(self):
"""停止录音"""
if not self.recording:
return
self.recording = False
self._close_stream()
if len(self.recorded_frames) > 0:
# 保存录音
audio_data = b''.join(self.recorded_frames)
print(f"录音完成,共 {len(self.recorded_frames)}")
print(f"录音时长: {len(audio_data) / (self.sample_rate * 2):.2f}")
# 调用回调函数
if self.on_recording_complete:
self.on_recording_complete(audio_data)
# 重置状态
self.recorded_frames = []
self.silence_start_time = None
self.recording_start_time = None
def save_audio(self, audio_data, filename):
"""保存音频到文件"""
try:
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.sample_rate)
wf.writeframes(audio_data)
print(f"音频已保存到: {filename}")
return True
except Exception as e:
print(f"保存音频失败: {e}")
return False
def set_recording_complete_callback(self, callback):
"""设置录音完成回调函数"""
self.on_recording_complete = callback
def set_speech_detected_callback(self, callback):
"""设置语音检测回调函数"""
self.on_speech_detected = callback
def adjust_sensitivity(self, energy_threshold=None, silence_threshold=None):
"""调整灵敏度"""
if energy_threshold is not None:
self.energy_threshold = energy_threshold
print(f"能量阈值调整为: {energy_threshold}")
if silence_threshold is not None:
self.silence_threshold = silence_threshold
print(f"静音阈值调整为: {silence_threshold}")
def get_audio_level(self):
"""获取当前音频级别"""
if len(self.audio_buffer) > 0:
latest_data = self.audio_buffer[-1]
return self._calculate_energy(latest_data)
return 0
def cleanup(self):
"""清理资源"""
self.stop_recording()
if self.audio:
self.audio.terminate()
self.audio = None
def main():
    """Interactive smoke test: listen on the microphone and save each recording."""
    banner = (
        "🎙️ 语音录制测试",
        "=" * 50,
        "配置:",
        "- 能量阈值: 500",
        "- 静音阈值: 1.0秒",
        "- 最小录音时间: 0.5秒",
        "- 最大录音时间: 10秒",
        "=" * 50,
        "请说话测试录音功能...",
        "按 Ctrl+C 退出",
    )
    for line in banner:
        print(line)

    # Build the recorder with the same settings the banner advertises.
    settings = {
        "energy_threshold": 500,
        "silence_threshold": 1.0,
        "min_recording_time": 0.5,
        "max_recording_time": 10.0,
    }
    recorder = VoiceRecorder(**settings)

    def handle_complete(audio_data):
        """Save the finished recording under a timestamped name and report it."""
        filename = "recording_{}.wav".format(time.strftime("%Y%m%d_%H%M%S"))
        recorder.save_audio(audio_data, filename)
        print(f"✅ 录音文件已保存: {filename}")

        # 16-bit mono audio: two bytes per sample.
        duration = len(audio_data) / (recorder.sample_rate * 2)
        print(f"录音时长: {duration:.2f}")

    def handle_speech():
        """Announce that speech activity was detected."""
        print("🔊 检测到语音活动...")

    recorder.set_recording_complete_callback(handle_complete)
    recorder.set_speech_detected_callback(handle_speech)

    try:
        recorder.start_listening()

        # Keep the main thread alive, polling the input level as a simple meter.
        while True:
            time.sleep(0.1)
            current = recorder.get_audio_level()
            if current <= 100:
                continue
            print(f"当前音频级别: {current:.0f}", end='\r')

    except KeyboardInterrupt:
        print("\n👋 退出录音测试")
    finally:
        recorder.cleanup()
# Script entry point: run the recording test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()