Compare commits
13 Commits
ef39e31a4b
...
df9616b48a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
df9616b48a | ||
|
|
072bb0e6b0 | ||
|
|
e6341b8620 | ||
|
|
48b99384b7 | ||
|
|
c01e6ad1f6 | ||
|
|
918bfb24af | ||
|
|
939a1721d6 | ||
|
|
12c79a5a53 | ||
|
|
b526328fe6 | ||
|
|
2612ef5b46 | ||
|
|
b87be1494d | ||
|
|
70c42eca15 | ||
|
|
eb099d827d |
127
recognition_example.py
Normal file
127
recognition_example.py
Normal file
@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
语音识别使用示例
|
||||
演示如何使用 speech_recognizer 模块
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
from speech_recognizer import SpeechRecognizer
|
||||
|
||||
async def example_recognize_file():
    """Example 1: recognize a single audio file with SpeechRecognizer.

    FIX: credentials are now read from the SAUC_APP_KEY / SAUC_ACCESS_KEY
    environment variables (as main() advertises), falling back to the old
    placeholder strings for backward compatibility.
    """
    print("=== 示例1:识别单个音频文件 ===")

    # Build the recognizer; prefer real credentials from the environment.
    recognizer = SpeechRecognizer(
        app_key=os.getenv("SAUC_APP_KEY", "your_app_key"),
        access_key=os.getenv("SAUC_ACCESS_KEY", "your_access_key")
    )

    # A recording produced by enhanced_wake_and_record.py is expected here.
    audio_file = "recording_20240101_120000.wav"

    if not os.path.exists(audio_file):
        print(f"音频文件不存在: {audio_file}")
        print("请先运行 enhanced_wake_and_record.py 录制一个音频文件")
        return

    try:
        results = await recognizer.recognize_file(audio_file)

        print(f"识别结果(共{len(results)}个):")
        for i, result in enumerate(results):
            print(f"结果 {i+1}:")
            print(f" 文本: {result.text}")
            print(f" 置信度: {result.confidence}")
            print(f" 最终结果: {result.is_final}")
            print("-" * 40)
    except Exception as e:
        print(f"识别失败: {e}")
|
||||
|
||||
async def example_recognize_latest():
    """Example 2: recognize the most recent recording on disk."""
    print("\n=== 示例2:识别最新的录音文件 ===")

    recognizer = SpeechRecognizer(
        app_key="your_app_key",  # replace with the real app_key
        access_key="your_access_key"  # replace with the real access_key
    )

    try:
        result = await recognizer.recognize_latest_recording()
        if result:
            print("识别结果:")
            print(f" 文本: {result.text}")
            print(f" 置信度: {result.confidence}")
            print(f" 最终结果: {result.is_final}")
        else:
            print("未找到录音文件或识别失败")
    except Exception as e:
        print(f"识别失败: {e}")
|
||||
|
||||
async def example_batch_recognition():
    """Example 3: batch-recognize up to five recorded WAV files.

    FIX: the per-file log line had lost its f-string placeholder; it now
    prints the actual filename. Files are also sorted for a deterministic
    processing order (os.listdir order is arbitrary).
    """
    print("\n=== 示例3:批量识别录音文件 ===")

    recognizer = SpeechRecognizer(
        app_key="your_app_key",  # replace with the real app_key
        access_key="your_access_key"  # replace with the real access_key
    )

    # Recordings produced by the recorder follow the recording_*.wav pattern.
    recording_files = sorted(
        f for f in os.listdir(".")
        if f.startswith('recording_') and f.endswith('.wav')
    )

    if not recording_files:
        print("未找到录音文件")
        return

    print(f"找到 {len(recording_files)} 个录音文件")

    for filename in recording_files[:5]:  # only process the first 5 files
        print(f"\n处理文件: {filename}")
        try:
            results = await recognizer.recognize_file(filename)
            if results:
                final_result = results[-1]  # the last result is the final one
                print(f"识别结果: {final_result.text}")
            else:
                print("识别失败")
        except Exception as e:
            print(f"处理失败: {e}")

        # Throttle requests so we do not hammer the service.
        await asyncio.sleep(1)
|
||||
|
||||
async def main():
    """Run all examples after checking that credentials are configured.

    FIX: the original guard was
    ``if not os.getenv("SAUC_APP_KEY") and "your_app_key" in "your_app_key":``
    whose second operand is a tautology (always True); it also never checked
    SAUC_ACCESS_KEY. Both environment variables are now required.
    """
    print("🚀 语音识别使用示例")
    print("=" * 50)

    if not os.getenv("SAUC_APP_KEY") or not os.getenv("SAUC_ACCESS_KEY"):
        print("⚠️ 请先设置 SAUC_APP_KEY 和 SAUC_ACCESS_KEY 环境变量")
        print("或者在代码中填入实际的 app_key 和 access_key")
        print("示例:")
        print("export SAUC_APP_KEY='your_app_key'")
        print("export SAUC_ACCESS_KEY='your_access_key'")
        return

    # Run the examples sequentially.
    await example_recognize_file()
    await example_recognize_latest()
    await example_batch_recognition()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
580
recorder.py
Normal file
580
recorder.py
Normal file
@ -0,0 +1,580 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
基于能量检测的极简录音系统
|
||||
专门针对树莓派3B优化,完全移除Vosk识别依赖
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import threading
|
||||
import pyaudio
|
||||
import numpy as np
|
||||
import wave
|
||||
|
||||
class EnergyBasedRecorder:
|
||||
"""基于能量检测的录音系统"""
|
||||
|
||||
def __init__(self, energy_threshold=500, silence_threshold=1.5, min_recording_time=2.0, max_recording_time=30.0):
    """Energy/ZCR-driven recorder tuned for a Raspberry Pi 3B.

    Args:
        energy_threshold: RMS level treated as "sound" (legacy; ZCR is primary).
        silence_threshold: seconds of silence that end a recording.
        min_recording_time: shortest take that will be kept, in seconds.
        max_recording_time: hard cap on a single take, in seconds.
    """
    # --- audio stream parameters (kept minimal for the Pi) ---
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    self.RATE = 8000          # 8 kHz sample rate
    self.CHUNK_SIZE = 1024    # moderate block size

    # --- detection thresholds ---
    self.energy_threshold = energy_threshold      # RMS above this counts as sound
    self.silence_threshold = silence_threshold    # seconds of quiet before stop
    self.min_recording_time = min_recording_time
    self.max_recording_time = max_recording_time
    self.pre_record_duration = 2.0                # seconds kept before the trigger

    # --- runtime state ---
    self.audio = None
    self.stream = None
    self.running = False
    self.recording = False
    self.recorded_frames = []
    self.recording_start_time = None
    self.last_sound_time = None
    self.energy_history = []
    self.zcr_history = []          # rolling zero-crossing-rate history
    self.max_energy_history = 50   # bound on the energy history

    # Ring buffer holding audio captured just before the trigger, so the
    # first syllable is not clipped off.
    self.pre_record_buffer = []
    self.pre_record_max_frames = int(self.pre_record_duration * self.RATE / self.CHUNK_SIZE)

    # Input processing is suspended while this is True.
    self.is_playing = False

    # ZCR-based silence-detection state.
    self.voice_activity_history = []
    self.max_voice_history = 20
    self.consecutive_silence_count = 0
    self.silence_threshold_count = 15    # consecutive silent chunks (~1.5 s)
    self.max_zcr_history = 30
    self.consecutive_low_zcr_count = 0
    self.low_zcr_threshold_count = 20    # consecutive low-ZCR chunks (~2 s)

    # Simple throughput monitoring.
    self.frame_count = 0
    self.start_time = time.time()

    self._setup_audio()
|
||||
|
||||
def _setup_audio(self):
    """(Re)open the PyAudio input stream with the configured parameters.

    NOTE(review): each call creates a fresh PyAudio instance without
    terminating the previous one — presumably acceptable for this demo;
    confirm it does not leak handles on long runs.
    """
    try:
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK_SIZE,
        )
        print("✅ 音频设备初始化成功")
    except Exception as e:
        # Leave self.stream as-is; run() refuses to start without a stream.
        print(f"❌ 音频设备初始化失败: {e}")
|
||||
|
||||
def calculate_energy(self, audio_data):
    """Return the RMS energy of a chunk of 16-bit PCM audio.

    While not recording, the value is also appended to
    ``self.energy_history`` (bounded by ``self.max_energy_history``) so the
    adaptive threshold tracks background noise rather than speech.
    """
    if len(audio_data) == 0:
        return 0

    audio_array = np.frombuffer(audio_data, dtype=np.int16)

    # FIX: square in float64. Squaring int16 samples in their own dtype
    # overflows (any |sample| > 181 wraps), silently corrupting the RMS.
    rms = np.sqrt(np.mean(audio_array.astype(np.float64) ** 2))

    # Only track background energy while idle, so speech does not inflate
    # the noise estimate used by auto_adjust_threshold().
    if not self.recording:
        self.energy_history.append(rms)
        if len(self.energy_history) > self.max_energy_history:
            self.energy_history.pop(0)

    return rms
|
||||
|
||||
def calculate_peak_energy(self, audio_data):
    """Return the absolute peak sample value of a PCM chunk (aux metric)."""
    if len(audio_data) == 0:
        return 0

    samples = np.frombuffer(audio_data, dtype=np.int16)
    return np.max(np.abs(samples))
|
||||
|
||||
def calculate_zero_crossing_rate(self, audio_data):
    """Return the zero-crossing rate (crossings/second) of a PCM chunk.

    The value is appended to a bounded rolling history used by the
    silence-detection heuristics.
    """
    if len(audio_data) == 0:
        return 0

    samples = np.frombuffer(audio_data, dtype=np.int16)

    # Count sign changes, then normalize by the sample rate.
    crossings = np.sum(np.diff(np.sign(samples)) != 0)
    zcr = crossings / len(samples) * self.RATE

    self.zcr_history.append(zcr)
    while len(self.zcr_history) > self.max_zcr_history:
        self.zcr_history.pop(0)

    return zcr
|
||||
|
||||
def is_voice_active_advanced(self, energy, zcr):
    """ZCR-only voice activity decision (the energy argument is ignored).

    Speech typically keeps its zero-crossing rate between 1200 and 6000 Hz;
    values outside that band are treated as noise or silence.
    """
    return 1200 < zcr < 6000
|
||||
|
||||
|
||||
def is_voice_active(self, energy):
    """Deprecated energy-based check kept only for compatibility.

    Detection is now ZCR-based (see is_voice_active_advanced); this
    always reports inactivity.
    """
    return False
|
||||
|
||||
def save_recording(self, audio_data, filename=None):
    """Write raw PCM bytes to a mono WAV file.

    Args:
        audio_data: raw little-endian PCM bytes.
        filename: target path; defaults to a timestamped recording_*.wav.

    Returns:
        (True, filename) on success, (False, None) on failure.
    """
    if filename is None:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"recording_{timestamp}.wav"

    try:
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
            wf.setframerate(self.RATE)
            wf.writeframes(audio_data)

        # FIX: the success log had lost its f-string placeholder — include
        # the actual filename so the user can find the file.
        print(f"✅ 录音已保存: {filename}")
        return True, filename
    except Exception as e:
        print(f"❌ 保存录音失败: {e}")
        return False, None
|
||||
|
||||
def play_audio(self, filename):
    """Play a WAV file through PyAudio with the input stream fully closed.

    The input stream is torn down before playback so the recorder cannot
    capture its own output; it is re-opened in the ``finally`` block.
    Falls back to the system player if PyAudio playback fails.

    FIX: the "start playing" log line had lost its f-string placeholder —
    it now includes the filename.
    """
    try:
        print("🔇 准备播放,完全停止音频输入")

        # Abort any in-progress take and drop its buffers.
        if self.recording:
            self.recording = False
            self.recorded_frames = []
            self.recording_start_time = None
            self.last_sound_time = None

        # Clear every buffer so stale audio cannot leak into the next take.
        self.pre_record_buffer = []
        self.energy_history = []
        self.zcr_history = []

        # Fully close the input stream.
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

        self.is_playing = True

        # Give the device a moment to stop capturing.
        time.sleep(0.5)

        with wave.open(filename, 'rb') as wf:
            channels = wf.getnchannels()
            width = wf.getsampwidth()
            rate = wf.getframerate()
            total_frames = wf.getnframes()

            # Read the whole file in chunks up front.
            chunk_size = 1024
            frames = []
            for _ in range(0, total_frames, chunk_size):
                chunk = wf.readframes(chunk_size)
                if chunk:
                    frames.append(chunk)
                else:
                    break

        playback_stream = self.audio.open(
            format=self.audio.get_format_from_width(width),
            channels=channels,
            rate=rate,
            output=True
        )

        print(f"🔊 开始播放: {filename}")
        print("🚫 音频输入已完全关闭")

        for chunk in frames:
            playback_stream.write(chunk)

        playback_stream.stop_stream()
        playback_stream.close()

        print("✅ 播放完成")
        print("🔄 重新开启音频输入")

    except Exception as e:
        print(f"❌ 播放失败: {e}")
        self.play_with_system_player(filename)
    finally:
        self.is_playing = False

        # Let playback drain completely before re-enabling capture.
        time.sleep(0.3)

        # Re-open the input stream and reset the detection state.
        self._setup_audio()
        self.energy_history = []
        self.zcr_history = []
        print("📡 音频输入已重新开启")
|
||||
|
||||
def play_with_system_player(self, filename):
    """Fallback playback via the external ``aplay`` command (Linux)."""
    try:
        import subprocess
        cmd = ['aplay', filename]  # Linux system player
        print(f"🔊 使用系统播放器: {' '.join(cmd)}")
        print("🚫 系统播放器播放中,音频输入保持关闭")
        subprocess.run(cmd, check=True)
        print("✅ 播放完成")
        print("📡 音频输入已保持关闭状态")
    except Exception as e:
        # Best-effort: report and carry on, nothing else we can do here.
        print(f"❌ 系统播放器也失败: {e}")
|
||||
|
||||
def play_audio_safe(self, filename):
    """Play a recording through the system player with input suspended.

    Safer than play_audio(): the external player cannot feed back into the
    (closed) input stream. Capture is restored in the ``finally`` block.

    FIX: the "start playing" log line had lost its f-string placeholder —
    it now includes the filename.
    """
    try:
        print("🔇 准备播放,完全停止音频输入")

        # Abort any in-progress take and drop its buffers.
        if self.recording:
            self.recording = False
            self.recorded_frames = []
            self.recording_start_time = None
            self.last_sound_time = None

        # Clear every buffer so stale audio cannot leak into the next take.
        self.pre_record_buffer = []
        self.energy_history = []
        self.zcr_history = []

        # Fully close the input stream.
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

        self.is_playing = True

        # Give the device a moment to stop capturing.
        time.sleep(0.5)

        print(f"🔊 开始播放: {filename}")
        print("🚫 使用系统播放器,音频输入已完全关闭")

        self.play_with_system_player(filename)

        print("🔄 准备重新开启音频输入")

    except Exception as e:
        print(f"❌ 播放失败: {e}")
    finally:
        self.is_playing = False

        # Let playback drain completely before re-enabling capture.
        time.sleep(0.5)

        # Re-open the input stream and reset the detection state.
        self._setup_audio()
        self.energy_history = []
        self.zcr_history = []
        print("📡 音频输入已重新开启")
|
||||
|
||||
def update_pre_record_buffer(self, audio_data):
    """Append a chunk to the pre-record ring buffer, evicting the oldest."""
    self.pre_record_buffer.append(audio_data)
    while len(self.pre_record_buffer) > self.pre_record_max_frames:
        self.pre_record_buffer.pop(0)
|
||||
|
||||
def start_recording(self):
    """Begin a new take, seeded with the pre-record ring buffer."""
    print("🎙️ 检测到声音,开始录音...")
    self.recording = True
    self.recorded_frames = []

    # Prepend the buffered audio so the first words are not clipped,
    # then clear the ring buffer for the next listening phase.
    self.recorded_frames.extend(self.pre_record_buffer)
    self.pre_record_buffer = []

    now = time.time()
    self.recording_start_time = now
    self.last_sound_time = now
    self.energy_history = []
    self.zcr_history = []

    # Reset all silence-detection counters for the new take.
    self.consecutive_low_zcr_count = 0
    self.consecutive_silence_count = 0
    self.voice_activity_history = []
|
||||
|
||||
def stop_recording(self):
    """Finalize the current take: save it, play it back, then reset state."""
    if self.recorded_frames:
        pcm = b''.join(self.recorded_frames)
        # 16-bit mono audio: 2 bytes per sample.
        total_seconds = len(pcm) / (self.RATE * 2)

        actual_duration = total_seconds
        preroll = min(total_seconds, self.pre_record_duration)

        print(f"📝 录音完成,时长: {actual_duration:.2f}秒 (包含预录音 {preroll:.1f}秒)")

        saved, wav_path = self.save_recording(pcm)

        if saved and wav_path:
            print("=" * 50)
            print("🔊 播放刚才录制的音频...")
            # The system player avoids feedback into the input stream.
            self.play_audio_safe(wav_path)
            print("=" * 50)

    # Reset all recording state regardless of the save outcome.
    self.recording = False
    self.recorded_frames = []
    self.recording_start_time = None
    self.last_sound_time = None
    self.energy_history = []
    self.zcr_history = []
|
||||
|
||||
def monitor_performance(self):
    """Count processed frames and report throughput every 1000 frames.

    FIX: the original called ``self.get_average_energy()``, which is not
    defined anywhere in this class, so the 1000th frame raised
    AttributeError. The average is now computed inline from
    ``self.energy_history`` (0.0 when empty).
    """
    self.frame_count += 1
    if self.frame_count % 1000 == 0:  # report once every 1000 frames
        elapsed = time.time() - self.start_time
        fps = self.frame_count / elapsed
        if self.energy_history:
            avg_energy = sum(self.energy_history) / len(self.energy_history)
        else:
            avg_energy = 0.0
        print(f"📊 性能: {fps:.1f} FPS | 平均能量: {avg_energy:.1f} | 阈值: {self.energy_threshold}")
|
||||
|
||||
def auto_adjust_threshold(self):
    """Adapt energy_threshold to the observed background-noise statistics."""
    if len(self.energy_history) < 20:
        return  # not enough background samples yet

    # Target threshold = median + 2*stddev of recent background energy,
    # floored at 300 so silence never drives it to zero.
    target = max(300, np.median(self.energy_history) + 2 * np.std(self.energy_history))

    # Exponential smoothing keeps the threshold from jumping around.
    self.energy_threshold = 0.9 * self.energy_threshold + 0.1 * target
|
||||
|
||||
def run(self):
    """Main loop: listen, start a take on ZCR voice activity, stop on silence."""
    if not self.stream:
        print("❌ 音频设备未初始化")
        return

    self.running = True
    print("🎤 开始监听...")
    print(f"能量阈值: {self.energy_threshold} (已弃用)")
    print(f"静音阈值: {self.silence_threshold}秒")
    print("📖 使用说明:")
    print("- 检测到声音自动开始录音")
    print("- 持续静音3秒自动结束录音")
    print("- 最少录音2秒,最多30秒")
    print("- 录音完成后自动播放")
    print("- 按 Ctrl+C 退出")
    print("🎯 新增功能:")
    print("- 纯ZCR语音检测(移除能量检测)")
    print("- 零交叉率检测(区分语音和噪音)")
    print("- 实时显示ZCR状态")
    print("- 预录音功能(包含声音开始前2秒)")
    print("- 环形缓冲区防止丢失开头音频")
    print("🤖 纯ZCR静音检测:")
    print("- 连续低ZCR计数(20次=2秒)")
    print("- ZCR活动历史追踪")
    print("- 基于ZCR模式的静音验证")
    print("- 语音范围: 1200-6000 Hz (提高阈值)")
    print("=" * 50)

    try:
        while self.running:
            data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
            if len(data) == 0:
                continue

            # While playback is active, ignore the input entirely so we do
            # not re-record our own output.
            if self.is_playing:
                status = "🔊 播放中... 跳过录音处理"
                print(f"\r{status}", end='', flush=True)
                time.sleep(0.1)  # lower CPU use while playing
                continue

            # Per-chunk metrics. peak_energy is computed for parity with the
            # original flow even though only ZCR drives the decisions.
            energy = self.calculate_energy(data)
            zcr = self.calculate_zero_crossing_rate(data)
            peak_energy = self.calculate_peak_energy(data)

            self.monitor_performance()

            if self.recording:
                # --- recording phase ---
                self.recorded_frames.append(data)
                recording_duration = time.time() - self.recording_start_time

                # ZCR-driven activity tracking.
                if self.is_voice_active_advanced(energy, zcr):
                    self.last_sound_time = time.time()
                    self.consecutive_low_zcr_count = 0
                    self.consecutive_silence_count = 0
                else:
                    self.consecutive_low_zcr_count += 1
                    self.consecutive_silence_count += 1

                # Rolling record of whether each chunk's ZCR was in the
                # speech band.
                self.voice_activity_history.append(1200 < zcr < 6000)
                if len(self.voice_activity_history) > self.max_voice_history:
                    self.voice_activity_history.pop(0)

                current_time = time.time()
                should_stop = False
                stop_reason = ""

                # Primary: long run of low-ZCR chunks, verified against the
                # recent activity history when enough of it exists.
                if self.consecutive_low_zcr_count >= self.low_zcr_threshold_count:
                    if len(self.voice_activity_history) >= 15:
                        recent_voice_activity = sum(self.voice_activity_history[-15:])
                        if recent_voice_activity <= 3:  # ≤3 active of last 15
                            should_stop = True
                            stop_reason = f"ZCR静音检测 ({self.consecutive_low_zcr_count}次连续低ZCR)"
                    else:
                        # Not enough history yet — fall back to the raw count.
                        should_stop = True
                        stop_reason = f"基础ZCR静音检测 ({self.consecutive_low_zcr_count}次)"

                # Backup: plain wall-clock silence timeout.
                if not should_stop and current_time - self.last_sound_time > self.silence_threshold:
                    should_stop = True
                    stop_reason = f"时间静音检测 ({self.silence_threshold}秒)"

                if should_stop and recording_duration >= self.min_recording_time:
                    print(f"\n🔇 {stop_reason},结束录音")
                    self.stop_recording()

                # Hard cap on take length.
                if recording_duration > self.max_recording_time:
                    print(f"\n⏰ 达到最大录音时间 {self.max_recording_time}秒")
                    self.stop_recording()

                # Live status line (ZCR-centric).
                is_voice = self.is_voice_active_advanced(energy, zcr)
                zcr_progress = f"{self.consecutive_low_zcr_count}/{self.low_zcr_threshold_count}"
                recent_activity = sum(self.voice_activity_history[-5:]) if len(self.voice_activity_history) >= 5 else 0
                status = f"录音中... {recording_duration:.1f}s | ZCR: {zcr:.0f} | 语音: {is_voice} | 低ZCR计数: {zcr_progress} | 活动: {recent_activity}"
                print(f"\r{status}", end='', flush=True)

            else:
                # --- listening phase: keep the pre-record ring buffer warm ---
                self.update_pre_record_buffer(data)

                if self.is_voice_active_advanced(energy, zcr):
                    self.start_recording()
                else:
                    is_voice = self.is_voice_active_advanced(energy, zcr)
                    buffer_usage = len(self.pre_record_buffer) / self.pre_record_max_frames * 100
                    status = f"监听中... ZCR: {zcr:.0f} | 语音: {is_voice} | 缓冲: {buffer_usage:.0f}%"
                    print(f"\r{status}", end='', flush=True)

            # Small sleep keeps CPU usage down on the Pi.
            time.sleep(0.01)

    except KeyboardInterrupt:
        print("\n👋 退出")
    except Exception as e:
        print(f"❌ 错误: {e}")
    finally:
        self.stop()
|
||||
|
||||
def stop(self):
    """Shut down: finish any in-flight take, close the stream, free PyAudio."""
    self.running = False

    if self.recording:
        self.stop_recording()

    if self.stream:
        self.stream.stop_stream()
        self.stream.close()

    if self.audio:
        self.audio.terminate()
|
||||
|
||||
def main():
    """Entry point: build the recorder with tuned thresholds and run it."""
    print("🚀 基于能量检测的极简录音系统")
    print("=" * 50)

    recorder = EnergyBasedRecorder(
        energy_threshold=200,     # lowered for higher sensitivity (legacy knob)
        silence_threshold=3.0,    # seconds of silence that end a take
        min_recording_time=2.0,   # shortest take kept
        max_recording_time=30.0   # hard cap per take
    )

    print("✅ 系统初始化成功")
    print("🎯 优化特点:")
    print(" - 完全移除Vosk识别依赖")
    print(" - 基于能量检测,极低CPU占用")
    print(" - 自动调整能量阈值")
    print(" - 实时性能监控")
    print(" - 预期延迟: <0.1秒")
    print("=" * 50)

    recorder.run()


if __name__ == "__main__":
    main()
|
||||
@ -1,3 +1,5 @@
|
||||
vosk>=0.3.44
|
||||
pyaudio>=0.2.11
|
||||
numpy>=1.19.0
|
||||
aiohttp>=3.8.0
|
||||
asyncio
|
||||
15
sauc_python/readme.md
Normal file
15
sauc_python/readme.md
Normal file
@ -0,0 +1,15 @@
|
||||
# README
|
||||
|
||||
**asr tob 相关client demo**
|
||||
|
||||
# Notice
|
||||
python version: python 3.x
|
||||
|
||||
替换代码中的key为真实数据:
|
||||
"app_key": "xxxxxxx",
|
||||
"access_key": "xxxxxxxxxxxxxxxx"
|
||||
使用示例:
|
||||
python3 sauc_websocket_demo.py --file /Users/bytedance/code/python/eng_ddc_itn.wav
|
||||
|
||||
|
||||
|
||||
523
sauc_python/sauc_websocket_demo.py
Normal file
523
sauc_python/sauc_websocket_demo.py
Normal file
@ -0,0 +1,523 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import json
|
||||
import struct
|
||||
import gzip
|
||||
import uuid
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import Optional, List, Dict, Any, Tuple, AsyncGenerator
|
||||
|
||||
# 配置日志
|
||||
# Logging: mirror every record to run.log and to the console.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    handlers=[
        logging.FileHandler('run.log'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# Sample rate the ASR service expects for mono 16-bit PCM input.
DEFAULT_SAMPLE_RATE = 16000
|
||||
|
||||
class ProtocolVersion:
    """Wire-protocol version nibble (high 4 bits of byte 0)."""
    V1 = 0b0001
|
||||
|
||||
class MessageType:
    """Frame types (high nibble of byte 1); client sends, server answers."""
    CLIENT_FULL_REQUEST = 0b0001
    CLIENT_AUDIO_ONLY_REQUEST = 0b0010
    SERVER_FULL_RESPONSE = 0b1001
    SERVER_ERROR_RESPONSE = 0b1111
|
||||
|
||||
class MessageTypeSpecificFlags:
    """Per-frame flag bits (low nibble of byte 1); NEG_* marks the last frame."""
    NO_SEQUENCE = 0b0000
    POS_SEQUENCE = 0b0001
    NEG_SEQUENCE = 0b0010
    NEG_WITH_SEQUENCE = 0b0011
|
||||
|
||||
class SerializationType:
    """Payload serialization method (high nibble of byte 2)."""
    NO_SERIALIZATION = 0b0000
    JSON = 0b0001
|
||||
|
||||
class CompressionType:
    """Payload compression method (low nibble of byte 2)."""
    GZIP = 0b0001
|
||||
|
||||
|
||||
class Config:
    """Holds the console-issued credentials for the ASR service."""

    def __init__(self):
        # Replace these with the app id and access token from the console.
        self.auth = {
            "app_key": "xxxxxxx",
            "access_key": "xxxxxxxxxxxx"
        }

    @property
    def app_key(self) -> str:
        """Application key used in the X-Api-App-Key header."""
        return self.auth["app_key"]

    @property
    def access_key(self) -> str:
        """Access key used in the X-Api-Access-Key header."""
        return self.auth["access_key"]


# Module-level singleton consumed by RequestBuilder.
config = Config()
|
||||
|
||||
class CommonUtils:
    """Static helpers: gzip, WAV detection/parsing, and ffmpeg conversion."""

    @staticmethod
    def gzip_compress(data: bytes) -> bytes:
        """Gzip-compress raw bytes."""
        return gzip.compress(data)

    @staticmethod
    def gzip_decompress(data: bytes) -> bytes:
        """Inverse of gzip_compress."""
        return gzip.decompress(data)

    @staticmethod
    def judge_wav(data: bytes) -> bool:
        """True when the buffer starts with a plausible RIFF/WAVE header."""
        return len(data) >= 44 and data[:4] == b'RIFF' and data[8:12] == b'WAVE'

    @staticmethod
    def convert_wav_with_path(audio_path: str, sample_rate: int = DEFAULT_SAMPLE_RATE) -> bytes:
        """Convert any audio file to 16-bit mono WAV at sample_rate via ffmpeg.

        NOTE: the source file is deleted after a successful conversion.

        Raises:
            RuntimeError: when ffmpeg exits with a non-zero status.
        """
        cmd = [
            "ffmpeg", "-v", "quiet", "-y", "-i", audio_path,
            "-acodec", "pcm_s16le", "-ac", "1", "-ar", str(sample_rate),
            "-f", "wav", "-"
        ]
        try:
            result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # Best-effort cleanup of the original file.
            try:
                os.remove(audio_path)
            except OSError as e:
                logger.warning(f"Failed to remove original file: {e}")

            return result.stdout
        except subprocess.CalledProcessError as e:
            logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
            raise RuntimeError(f"Audio conversion failed: {e.stderr.decode()}")

    @staticmethod
    def read_wav_info(data: bytes) -> Tuple[int, int, int, int, bytes]:
        """Parse a WAV buffer.

        Returns:
            (channels, bytes_per_sample, sample_rate, frame_count, pcm_bytes).

        Raises:
            ValueError: when the buffer is not a well-formed WAV file.
        """
        if len(data) < 44:
            raise ValueError("Invalid WAV file: too short")

        if data[:4] != b'RIFF':
            raise ValueError("Invalid WAV file: not RIFF format")
        if data[8:12] != b'WAVE':
            raise ValueError("Invalid WAV file: not WAVE format")

        # fmt sub-chunk fields at their canonical offsets.
        audio_format = struct.unpack('<H', data[20:22])[0]
        num_channels = struct.unpack('<H', data[22:24])[0]
        sample_rate = struct.unpack('<I', data[24:28])[0]
        bits_per_sample = struct.unpack('<H', data[34:36])[0]

        # Walk the sub-chunks after fmt until the data chunk is found.
        pos = 36
        while pos < len(data) - 8:
            subchunk_id = data[pos:pos + 4]
            subchunk_size = struct.unpack('<I', data[pos + 4:pos + 8])[0]
            if subchunk_id == b'data':
                wave_data = data[pos + 8:pos + 8 + subchunk_size]
                bytes_per_sample = bits_per_sample // 8
                return (
                    num_channels,
                    bytes_per_sample,
                    sample_rate,
                    subchunk_size // (num_channels * bytes_per_sample),
                    wave_data
                )
            pos += 8 + subchunk_size

        raise ValueError("Invalid WAV file: no data subchunk found")
|
||||
|
||||
class AsrRequestHeader:
    """Fluent builder for the 4-byte binary protocol header."""

    def __init__(self):
        # Defaults: full client request, positive sequence, JSON + gzip.
        self.message_type = MessageType.CLIENT_FULL_REQUEST
        self.message_type_specific_flags = MessageTypeSpecificFlags.POS_SEQUENCE
        self.serialization_type = SerializationType.JSON
        self.compression_type = CompressionType.GZIP
        self.reserved_data = bytes([0x00])

    def with_message_type(self, message_type: int) -> 'AsrRequestHeader':
        """Set the message type; returns self for chaining."""
        self.message_type = message_type
        return self

    def with_message_type_specific_flags(self, flags: int) -> 'AsrRequestHeader':
        """Set the per-frame flag bits; returns self for chaining."""
        self.message_type_specific_flags = flags
        return self

    def with_serialization_type(self, serialization_type: int) -> 'AsrRequestHeader':
        """Set the payload serialization method; returns self for chaining."""
        self.serialization_type = serialization_type
        return self

    def with_compression_type(self, compression_type: int) -> 'AsrRequestHeader':
        """Set the payload compression method; returns self for chaining."""
        self.compression_type = compression_type
        return self

    def with_reserved_data(self, reserved_data: bytes) -> 'AsrRequestHeader':
        """Set the reserved trailing byte(s); returns self for chaining."""
        self.reserved_data = reserved_data
        return self

    def to_bytes(self) -> bytes:
        """Pack as: version|header-size, type|flags, serial|compress, reserved."""
        packed = bytearray((
            (ProtocolVersion.V1 << 4) | 1,
            (self.message_type << 4) | self.message_type_specific_flags,
            (self.serialization_type << 4) | self.compression_type,
        ))
        packed.extend(self.reserved_data)
        return bytes(packed)

    @staticmethod
    def default_header() -> 'AsrRequestHeader':
        """Return a fresh header with the default settings."""
        return AsrRequestHeader()
|
||||
|
||||
class RequestBuilder:
    """Builds the HTTP headers and binary frames sent to the ASR service."""

    @staticmethod
    def new_auth_headers() -> Dict[str, str]:
        """Return auth headers carrying a fresh request id."""
        return {
            "X-Api-Resource-Id": "volc.bigasr.sauc.duration",
            "X-Api-Request-Id": str(uuid.uuid4()),
            "X-Api-Access-Key": config.access_key,
            "X-Api-App-Key": config.app_key
        }

    @staticmethod
    def new_full_client_request(seq: int) -> bytes:
        """Build the first frame: gzip'd JSON session config with sequence `seq`."""
        header = AsrRequestHeader.default_header() \
            .with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)

        payload = {
            "user": {
                "uid": "demo_uid"
            },
            "audio": {
                "format": "wav",
                "codec": "raw",
                "rate": 16000,
                "bits": 16,
                "channel": 1
            },
            "request": {
                "model_name": "bigmodel",
                "enable_itn": True,
                "enable_punc": True,
                "enable_ddc": True,
                "show_utterances": True,
                "enable_nonstream": False
            }
        }

        body = CommonUtils.gzip_compress(json.dumps(payload).encode('utf-8'))

        frame = bytearray(header.to_bytes())
        frame.extend(struct.pack('>i', seq))        # signed sequence number
        frame.extend(struct.pack('>I', len(body)))  # compressed payload size
        frame.extend(body)
        return bytes(frame)

    @staticmethod
    def new_audio_only_request(seq: int, segment: bytes, is_last: bool = False) -> bytes:
        """Build an audio frame; the final frame carries a negated sequence."""
        header = AsrRequestHeader.default_header()
        if is_last:
            header.with_message_type_specific_flags(MessageTypeSpecificFlags.NEG_WITH_SEQUENCE)
            seq = -seq  # negative sequence marks the last packet
        else:
            header.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
        header.with_message_type(MessageType.CLIENT_AUDIO_ONLY_REQUEST)

        frame = bytearray(header.to_bytes())
        frame.extend(struct.pack('>i', seq))

        body = CommonUtils.gzip_compress(segment)
        frame.extend(struct.pack('>I', len(body)))
        frame.extend(body)
        return bytes(frame)
|
||||
|
||||
class AsrResponse:
    """Mutable holder for one parsed server frame."""

    def __init__(self):
        self.code = 0                 # error code (0 = OK)
        self.event = 0                # optional event id
        self.is_last_package = False  # True on the final frame of a stream
        self.payload_sequence = 0     # sequence echoed by the server
        self.payload_size = 0         # declared payload size in bytes
        self.payload_msg = None       # decoded JSON body, if any

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the response (handy for logging)."""
        return {
            "code": self.code,
            "event": self.event,
            "is_last_package": self.is_last_package,
            "payload_sequence": self.payload_sequence,
            "payload_size": self.payload_size,
            "payload_msg": self.payload_msg,
        }
|
||||
|
||||
class ResponseParser:
    """Decodes binary frames received from the ASR service."""

    @staticmethod
    def parse_response(msg: bytes) -> AsrResponse:
        """Parse one server frame into an AsrResponse."""
        response = AsrResponse()

        # Header: byte 0 low nibble = header size (in 4-byte words);
        # byte 1 = type|flags; byte 2 = serialization|compression.
        header_size = msg[0] & 0x0f
        message_type = msg[1] >> 4
        flags = msg[1] & 0x0f
        serialization = msg[2] >> 4
        compression = msg[2] & 0x0f

        payload = msg[header_size * 4:]

        # Flag bits: 0x01 = sequence present, 0x02 = last package,
        # 0x04 = event id present.
        if flags & 0x01:
            response.payload_sequence = struct.unpack('>i', payload[:4])[0]
            payload = payload[4:]
        if flags & 0x02:
            response.is_last_package = True
        if flags & 0x04:
            response.event = struct.unpack('>i', payload[:4])[0]
            payload = payload[4:]

        # Type-dependent prefix before the payload body.
        if message_type == MessageType.SERVER_FULL_RESPONSE:
            response.payload_size = struct.unpack('>I', payload[:4])[0]
            payload = payload[4:]
        elif message_type == MessageType.SERVER_ERROR_RESPONSE:
            response.code = struct.unpack('>i', payload[:4])[0]
            response.payload_size = struct.unpack('>I', payload[4:8])[0]
            payload = payload[8:]

        if not payload:
            return response

        # Decompress the body when the server gzipped it.
        if compression == CompressionType.GZIP:
            try:
                payload = CommonUtils.gzip_decompress(payload)
            except Exception as e:
                logger.error(f"Failed to decompress payload: {e}")
                return response

        # Deserialize the body (only JSON is supported).
        try:
            if serialization == SerializationType.JSON:
                response.payload_msg = json.loads(payload.decode('utf-8'))
        except Exception as e:
            logger.error(f"Failed to parse payload: {e}")

        return response
|
||||
|
||||
class AsrWsClient:
|
||||
def __init__(self, url: str, segment_duration: int = 200):
    """WebSocket ASR client.

    Args:
        url: WebSocket endpoint of the ASR service.
        segment_duration: audio chunk length per frame, in milliseconds.
    """
    self.seq = 1                  # next outbound sequence number
    self.url = url
    self.segment_duration = segment_duration
    self.conn = None              # WebSocket, opened by create_connection()
    self.session = None           # aiohttp session, created in __aenter__
|
||||
|
||||
async def __aenter__(self):
    """Create the aiohttp session backing the WebSocket connection."""
    self.session = aiohttp.ClientSession()
    return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
if self.conn and not self.conn.closed:
|
||||
await self.conn.close()
|
||||
if self.session and not self.session.closed:
|
||||
await self.session.close()
|
||||
|
||||
async def read_audio_data(self, file_path: str) -> bytes:
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
content = f.read()
|
||||
|
||||
if not CommonUtils.judge_wav(content):
|
||||
logger.info("Converting audio to WAV format...")
|
||||
content = CommonUtils.convert_wav_with_path(file_path, DEFAULT_SAMPLE_RATE)
|
||||
|
||||
return content
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to read audio data: {e}")
|
||||
raise
|
||||
|
||||
def get_segment_size(self, content: bytes) -> int:
|
||||
try:
|
||||
channel_num, samp_width, frame_rate, _, _ = CommonUtils.read_wav_info(content)[:5]
|
||||
size_per_sec = channel_num * samp_width * frame_rate
|
||||
segment_size = size_per_sec * self.segment_duration // 1000
|
||||
return segment_size
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to calculate segment size: {e}")
|
||||
raise
|
||||
|
||||
async def create_connection(self) -> None:
|
||||
headers = RequestBuilder.new_auth_headers()
|
||||
try:
|
||||
self.conn = await self.session.ws_connect( # 使用self.session
|
||||
self.url,
|
||||
headers=headers
|
||||
)
|
||||
logger.info(f"Connected to {self.url}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to WebSocket: {e}")
|
||||
raise
|
||||
|
||||
async def send_full_client_request(self) -> None:
|
||||
request = RequestBuilder.new_full_client_request(self.seq)
|
||||
self.seq += 1 # 发送后递增
|
||||
try:
|
||||
await self.conn.send_bytes(request)
|
||||
logger.info(f"Sent full client request with seq: {self.seq-1}")
|
||||
|
||||
msg = await self.conn.receive()
|
||||
if msg.type == aiohttp.WSMsgType.BINARY:
|
||||
response = ResponseParser.parse_response(msg.data)
|
||||
logger.info(f"Received response: {response.to_dict()}")
|
||||
else:
|
||||
logger.error(f"Unexpected message type: {msg.type}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send full client request: {e}")
|
||||
raise
|
||||
|
||||
async def send_messages(self, segment_size: int, content: bytes) -> AsyncGenerator[None, None]:
|
||||
audio_segments = self.split_audio(content, segment_size)
|
||||
total_segments = len(audio_segments)
|
||||
|
||||
for i, segment in enumerate(audio_segments):
|
||||
is_last = (i == total_segments - 1)
|
||||
request = RequestBuilder.new_audio_only_request(
|
||||
self.seq,
|
||||
segment,
|
||||
is_last=is_last
|
||||
)
|
||||
await self.conn.send_bytes(request)
|
||||
logger.info(f"Sent audio segment with seq: {self.seq} (last: {is_last})")
|
||||
|
||||
if not is_last:
|
||||
self.seq += 1
|
||||
|
||||
await asyncio.sleep(self.segment_duration / 1000) # 逐个发送,间隔时间模拟实时流
|
||||
# 让出控制权,允许接受消息
|
||||
yield
|
||||
|
||||
async def recv_messages(self) -> AsyncGenerator[AsrResponse, None]:
|
||||
try:
|
||||
async for msg in self.conn:
|
||||
if msg.type == aiohttp.WSMsgType.BINARY:
|
||||
response = ResponseParser.parse_response(msg.data)
|
||||
yield response
|
||||
|
||||
if response.is_last_package or response.code != 0:
|
||||
break
|
||||
elif msg.type == aiohttp.WSMsgType.ERROR:
|
||||
logger.error(f"WebSocket error: {msg.data}")
|
||||
break
|
||||
elif msg.type == aiohttp.WSMsgType.CLOSED:
|
||||
logger.info("WebSocket connection closed")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error receiving messages: {e}")
|
||||
raise
|
||||
|
||||
async def start_audio_stream(self, segment_size: int, content: bytes) -> AsyncGenerator[AsrResponse, None]:
|
||||
async def sender():
|
||||
async for _ in self.send_messages(segment_size, content):
|
||||
pass
|
||||
|
||||
# 启动发送和接收任务
|
||||
sender_task = asyncio.create_task(sender())
|
||||
|
||||
try:
|
||||
async for response in self.recv_messages():
|
||||
yield response
|
||||
finally:
|
||||
sender_task.cancel()
|
||||
try:
|
||||
await sender_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def split_audio(data: bytes, segment_size: int) -> List[bytes]:
|
||||
if segment_size <= 0:
|
||||
return []
|
||||
|
||||
segments = []
|
||||
for i in range(0, len(data), segment_size):
|
||||
end = i + segment_size
|
||||
if end > len(data):
|
||||
end = len(data)
|
||||
segments.append(data[i:end])
|
||||
return segments
|
||||
|
||||
async def execute(self, file_path: str) -> AsyncGenerator[AsrResponse, None]:
|
||||
if not file_path:
|
||||
raise ValueError("File path is empty")
|
||||
|
||||
if not self.url:
|
||||
raise ValueError("URL is empty")
|
||||
|
||||
self.seq = 1
|
||||
|
||||
try:
|
||||
# 1. 读取音频文件
|
||||
content = await self.read_audio_data(file_path)
|
||||
|
||||
# 2. 计算分段大小
|
||||
segment_size = self.get_segment_size(content)
|
||||
|
||||
# 3. 创建WebSocket连接
|
||||
await self.create_connection()
|
||||
|
||||
# 4. 发送完整客户端请求
|
||||
await self.send_full_client_request()
|
||||
|
||||
# 5. 启动音频流处理
|
||||
async for response in self.start_audio_stream(segment_size, content):
|
||||
yield response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in ASR execution: {e}")
|
||||
raise
|
||||
finally:
|
||||
if self.conn:
|
||||
await self.conn.close()
|
||||
|
||||
async def main():
    """CLI entry point: stream one local audio file through the ASR service."""
    import argparse

    parser = argparse.ArgumentParser(description="ASR WebSocket Client")
    parser.add_argument("--file", type=str, required=True, help="Audio file path")

    # Known service endpoints:
    #wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
    #wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async
    #wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream
    parser.add_argument("--url", type=str, default="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream",
                        help="WebSocket URL")
    parser.add_argument("--seg-duration", type=int, default=200,
                        help="Audio duration(ms) per packet, default:200")

    args = parser.parse_args()

    # async-with guarantees the aiohttp session is closed on exit.
    async with AsrWsClient(args.url, args.seg_duration) as client:
        try:
            async for response in client.execute(args.file):
                logger.info(f"Received response: {json.dumps(response.to_dict(), indent=2, ensure_ascii=False)}")
        except Exception as e:
            logger.error(f"ASR processing failed: {e}")


if __name__ == "__main__":
    asyncio.run(main())


# Usage:
# python3 sauc_websocket_demo.py --file /Users/bytedance/code/python/eng_ddc_itn.wav
@ -1,403 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
简化的唤醒+录音测试
|
||||
专注于解决音频冲突问题
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import threading
|
||||
import pyaudio
|
||||
import json
|
||||
|
||||
# 添加当前目录到路径
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Optional dependency: wake-word detection degrades gracefully without Vosk.
try:
    from vosk import Model, KaldiRecognizer
    VOSK_AVAILABLE = True
except ImportError:
    VOSK_AVAILABLE = False
    print("⚠️ Vosk 未安装,请运行: pip install vosk")
||||
class SimpleWakeAndRecord:
    """Simplified wake-word + recording system.

    Listens on the microphone for a wake word (via Vosk), then records
    until speech stops, saves the clip to a WAV file, and plays it back.
    A single input stream is shared between wake detection and recording
    to avoid audio-device conflicts.
    """

    def __init__(self, model_path="model", wake_words=["你好", "助手"]):
        # NOTE(review): mutable default argument (wake_words) — shared across
        # instances if ever mutated in place; confirm callers never append.
        self.model_path = model_path
        self.wake_words = wake_words
        self.model = None
        self.recognizer = None
        self.audio = None
        self.stream = None
        self.running = False

        # Audio capture parameters (16 kHz, mono, 16-bit PCM).
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        self.CHUNK_SIZE = 1024

        # Recording state.
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None  # timestamp of the last recognized text
        self.recording_start_time = None
        self.recording_recognizer = None  # dedicated recognizer while recording

        # Thresholds (seconds).
        self.text_silence_threshold = 3.0  # end after 3 s with no recognized text
        self.min_recording_time = 2.0      # minimum recording length
        self.max_recording_time = 30.0     # maximum recording length

        self._setup_model()
        self._setup_audio()

    def _setup_model(self):
        """Load the Vosk model and build the wake-word recognizer."""
        if not VOSK_AVAILABLE:
            return

        try:
            if not os.path.exists(self.model_path):
                print(f"模型路径不存在: {self.model_path}")
                return

            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, self.RATE)
            self.recognizer.SetWords(True)

            print(f"✅ Vosk 模型加载成功")

        except Exception as e:
            print(f"模型初始化失败: {e}")

    def _setup_audio(self):
        """Open the shared PyAudio input stream (idempotent)."""
        try:
            if self.audio is None:
                self.audio = pyaudio.PyAudio()

            if self.stream is None:
                self.stream = self.audio.open(
                    format=self.FORMAT,
                    channels=self.CHANNELS,
                    rate=self.RATE,
                    input=True,
                    frames_per_buffer=self.CHUNK_SIZE
                )

            print("✅ 音频设备初始化成功")

        except Exception as e:
            print(f"音频设备初始化失败: {e}")

    def _calculate_energy(self, audio_data):
        """Return the RMS energy of a chunk of 16-bit PCM bytes."""
        if len(audio_data) == 0:
            return 0

        import numpy as np
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        rms = np.sqrt(np.mean(audio_array ** 2))
        return rms

    def _check_wake_word(self, text):
        """Return (matched, wake_word) if text contains any wake word."""
        if not text or not self.wake_words:
            return False, None

        # Case-insensitive substring match.
        text_lower = text.lower()
        for wake_word in self.wake_words:
            if wake_word.lower() in text_lower:
                return True, wake_word
        return False, None

    def _save_recording(self, audio_data):
        """Write the recorded PCM bytes to a timestamped WAV file.

        Returns (success, filename).
        """
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"recording_{timestamp}.wav"

        try:
            import wave
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(audio_data)

            # NOTE(review): "(unknown)" below looks like a mangled "{filename}"
            # placeholder — confirm against the original source.
            print(f"✅ 录音已保存: (unknown)")
            return True, filename
        except Exception as e:
            print(f"保存录音失败: {e}")
            return False, None

    def _play_audio(self, filename):
        """Play a WAV file through PyAudio, falling back to the OS player."""
        try:
            import wave

            # Open the audio file and read its parameters.
            with wave.open(filename, 'rb') as wf:
                channels = wf.getnchannels()
                width = wf.getsampwidth()
                rate = wf.getframerate()
                total_frames = wf.getnframes()

                # Read in chunks to keep memory bounded.
                chunk_size = 1024
                frames = []

                for _ in range(0, total_frames, chunk_size):
                    chunk = wf.readframes(chunk_size)
                    if chunk:
                        frames.append(chunk)
                    else:
                        break

            # Open an output stream matching the file's format.
            playback_stream = self.audio.open(
                format=self.audio.get_format_from_width(width),
                channels=channels,
                rate=rate,
                output=True
            )

            # NOTE(review): "(unknown)" likely stands for "{filename}".
            print(f"🔊 开始播放: (unknown)")

            # Play chunk by chunk.
            for chunk in frames:
                playback_stream.write(chunk)

            # Flush and close the playback stream.
            playback_stream.stop_stream()
            playback_stream.close()

            print("✅ 播放完成")

        except Exception as e:
            print(f"❌ 播放失败: {e}")
            # Fall back to the system player if PyAudio playback fails.
            self._play_with_system_player(filename)

    def _play_with_system_player(self, filename):
        """Best-effort playback via the platform's command-line player."""
        try:
            import platform
            import subprocess

            system = platform.system()

            if system == 'Darwin':  # macOS
                cmd = ['afplay', filename]
            elif system == 'Windows':
                cmd = ['start', '/min', filename]
            else:  # Linux
                cmd = ['aplay', filename]

            print(f"🔊 使用系统播放器: {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
            print("✅ 播放完成")

        except Exception as e:
            print(f"❌ 系统播放器也失败: {e}")
            # NOTE(review): "(unknown)" likely stands for "{filename}".
            print(f"💡 文件已保存,请手动播放: (unknown)")

    def _start_recording(self):
        """Switch to recording mode and reset recording state."""
        print("🎙️ 开始录音,请说话...")
        self.recording = True
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = time.time()

        # Fresh recognizer so the wake-word recognizer's state is untouched.
        if self.model:
            self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
            self.recording_recognizer.SetWords(True)

    def _stop_recording(self):
        """Finish recording: save, play back, and reset state."""
        if len(self.recorded_frames) > 0:
            audio_data = b''.join(self.recorded_frames)
            # 2 bytes per sample (16-bit mono).
            duration = len(audio_data) / (self.RATE * 2)
            print(f"📝 录音完成,时长: {duration:.2f}秒")

            # Save the clip.
            success, filename = self._save_recording(audio_data)

            # Play it back if saving succeeded.
            if success and filename:
                print("=" * 50)
                print("🔊 播放刚才录制的音频...")
                self._play_audio(filename)
                print("=" * 50)

        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None
        self.recording_start_time = None
        self.recording_recognizer = None

    def start(self):
        """Main loop: wake-word detection, then silence-bounded recording."""
        if not self.stream:
            print("❌ 音频设备未初始化")
            return

        self.running = True
        print("🎤 开始监听...")
        print(f"唤醒词: {', '.join(self.wake_words)}")

        try:
            while self.running:
                # Read one chunk from the shared input stream.
                data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)

                if len(data) == 0:
                    continue

                if self.recording:
                    # --- Recording mode ---
                    self.recorded_frames.append(data)
                    recording_duration = time.time() - self.recording_start_time

                    # Live recognition with the recording-only recognizer.
                    if self.recording_recognizer:
                        if self.recording_recognizer.AcceptWaveform(data):
                            # Final result for this utterance.
                            result = json.loads(self.recording_recognizer.Result())
                            text = result.get('text', '').strip()

                            if text:
                                # Text recognized — refresh the silence timer.
                                self.last_text_time = time.time()
                                print(f"\n📝 识别: {text}")
                        else:
                            # Partial (in-progress) result.
                            partial_result = json.loads(self.recording_recognizer.PartialResult())
                            partial_text = partial_result.get('partial', '').strip()

                            if partial_text:
                                # Partial text also counts as activity.
                                self.last_text_time = time.time()
                                status = f"录音中... {recording_duration:.1f}s | {partial_text}"
                                print(f"\r{status}", end='', flush=True)

                    # Decide whether the recording should end.
                    current_time = time.time()

                    # Silence after at least one recognition.
                    if self.last_text_time is not None:
                        text_silence_duration = current_time - self.last_text_time
                        if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
                            print(f"\n\n3秒没有识别到文字,结束录音")
                            self._stop_recording()
                    else:
                        # Nothing recognized yet — give up after 5 s.
                        if recording_duration > 5.0:
                            print(f"\n\n5秒没有识别到文字,结束录音")
                            self._stop_recording()

                    # Hard cap on recording length.
                    if recording_duration > self.max_recording_time:
                        print(f"\n\n达到最大录音时间 {self.max_recording_time}s")
                        self._stop_recording()

                    # Status line while waiting for first speech.
                    if self.last_text_time is None:
                        status = f"等待语音输入... {recording_duration:.1f}s"
                        print(f"\r{status}", end='', flush=True)

                elif self.model and self.recognizer:
                    # --- Wake-word detection mode ---
                    if self.recognizer.AcceptWaveform(data):
                        result = json.loads(self.recognizer.Result())
                        text = result.get('text', '').strip()

                        if text:
                            print(f"识别: {text}")

                            # Check for a wake word and switch to recording.
                            is_wake_word, detected_word = self._check_wake_word(text)
                            if is_wake_word:
                                print(f"🎯 检测到唤醒词: {detected_word}")
                                self._start_recording()
                    else:
                        # Show live energy/partial text while idle.
                        energy = self._calculate_energy(data)
                        if energy > 50:  # only show meaningful audio levels
                            partial_result = json.loads(self.recognizer.PartialResult())
                            partial_text = partial_result.get('partial', '')
                            if partial_text:
                                status = f"监听中... 能量: {energy:.0f} | {partial_text}"
                            else:
                                status = f"监听中... 能量: {energy:.0f}"
                            print(status, end='\r')

                time.sleep(0.01)

        except KeyboardInterrupt:
            print("\n👋 退出")
        except Exception as e:
            print(f"错误: {e}")
        finally:
            self.stop()

    def stop(self):
        """Stop the loop, finish any in-flight recording, release audio."""
        self.running = False
        if self.recording:
            self._stop_recording()

        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

        if self.audio:
            self.audio.terminate()
            self.audio = None
||||
def main():
    """Entry point: set up the wake-word + record demo and run it."""
    print("🚀 简化唤醒+录音测试")
    print("=" * 50)

    # Verify the Vosk model directory exists before constructing the system.
    model_dir = "model"
    if not os.path.exists(model_dir):
        print("⚠️ 未找到模型目录")
        print("请下载 Vosk 模型到 model 目录")
        return

    # Build the system with the default wake-word set.
    system = SimpleWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"]
    )

    if not system.model:
        print("❌ 模型加载失败")
        return

    print("✅ 系统初始化成功")
    print("📖 使用说明:")
    print("1. 说唤醒词开始录音")
    print("2. 基于语音识别判断,3秒没有识别到文字就结束")
    print("3. 最少录音2秒,最多30秒")
    print("4. 录音时实时显示识别结果")
    print("5. 录音文件自动保存")
    print("6. 录音完成后自动播放刚才录制的内容")
    print("7. 按 Ctrl+C 退出")
    print("=" * 50)

    # Blocks until Ctrl+C.
    system.start()


if __name__ == "__main__":
    main()
||||
532
speech_recognizer.py
Normal file
532
speech_recognizer.py
Normal file
@ -0,0 +1,532 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
语音识别模块
|
||||
基于 SAUC API 为录音文件提供语音识别功能
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import struct
|
||||
import gzip
|
||||
import uuid
|
||||
from typing import Optional, List, Dict, Any, AsyncGenerator
|
||||
from dataclasses import dataclass
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 常量定义
|
||||
DEFAULT_SAMPLE_RATE = 16000
|
||||
|
||||
class ProtocolVersion:
    """Protocol version nibble carried in the first header byte."""
    V1 = 0x1
||||
class MessageType:
    """Top-level message-type codes (high nibble of header byte 1)."""
    CLIENT_FULL_REQUEST = 0x1
    CLIENT_AUDIO_ONLY_REQUEST = 0x2
    SERVER_FULL_RESPONSE = 0x9
    SERVER_ERROR_RESPONSE = 0xF
|
||||
class MessageTypeSpecificFlags:
    """Sequence-number flags (low nibble of header byte 1)."""
    NO_SEQUENCE = 0x0
    POS_SEQUENCE = 0x1
    NEG_SEQUENCE = 0x2
    NEG_WITH_SEQUENCE = 0x3
|
||||
class SerializationType:
    """Payload serialization codes (high nibble of header byte 2)."""
    NO_SERIALIZATION = 0x0
    JSON = 0x1
|
||||
class CompressionType:
    """Payload compression codes (low nibble of header byte 2)."""
    GZIP = 0x1
||||
@dataclass
class RecognitionResult:
    """One piece of recognized speech returned by the SAUC service."""
    text: str                           # recognized text
    confidence: float                   # confidence score assigned by the caller
    is_final: bool                      # True for the final result of an utterance
    start_time: Optional[float] = None  # utterance start time, if provided
    end_time: Optional[float] = None    # utterance end time, if provided
||||
class AudioUtils:
    """Byte-level helpers: gzip framing and minimal WAV parsing."""

    @staticmethod
    def gzip_compress(data: bytes) -> bytes:
        """Compress *data* with gzip."""
        return gzip.compress(data)

    @staticmethod
    def gzip_decompress(data: bytes) -> bytes:
        """Inverse of :meth:`gzip_compress`."""
        return gzip.decompress(data)

    @staticmethod
    def is_wav_file(data: bytes) -> bool:
        """True when *data* is long enough and starts with a RIFF/WAVE header."""
        return len(data) >= 44 and data[:4] == b'RIFF' and data[8:12] == b'WAVE'

    @staticmethod
    def read_wav_info(data: bytes) -> tuple:
        """Parse a WAV blob.

        Returns ``(channels, bytes_per_sample, sample_rate, num_frames,
        pcm_bytes)``. Raises ValueError on malformed input.
        """
        if len(data) < 44:
            raise ValueError("Invalid WAV file: too short")

        # Validate the RIFF/WAVE container markers.
        if data[:4] != b'RIFF':
            raise ValueError("Invalid WAV file: not RIFF format")
        if data[8:12] != b'WAVE':
            raise ValueError("Invalid WAV file: not WAVE format")

        # fmt sub-chunk fields at their canonical offsets.
        num_channels = struct.unpack('<H', data[22:24])[0]
        sample_rate = struct.unpack('<I', data[24:28])[0]
        bits_per_sample = struct.unpack('<H', data[34:36])[0]
        sample_width = bits_per_sample // 8

        # Walk sub-chunks after the fmt block looking for 'data'.
        offset = 36
        while offset < len(data) - 8:
            chunk_tag = data[offset:offset + 4]
            chunk_len = struct.unpack('<I', data[offset + 4:offset + 8])[0]
            if chunk_tag == b'data':
                pcm = data[offset + 8:offset + 8 + chunk_len]
                frames = chunk_len // (num_channels * sample_width)
                return (num_channels, sample_width, sample_rate, frames, pcm)
            offset += 8 + chunk_len

        raise ValueError("Invalid WAV file: no data subchunk found")
|
||||
class AsrConfig:
    """Holds SAUC authentication credentials.

    Credentials fall back to the ``SAUC_APP_KEY`` / ``SAUC_ACCESS_KEY``
    environment variables, then to placeholder strings.
    """

    def __init__(self, app_key: str = None, access_key: str = None):
        resolved_app = app_key if app_key else os.getenv("SAUC_APP_KEY", "your_app_key")
        resolved_access = access_key if access_key else os.getenv("SAUC_ACCESS_KEY", "your_access_key")
        self.auth = {"app_key": resolved_app, "access_key": resolved_access}

    @property
    def app_key(self) -> str:
        """The configured application key."""
        return self.auth["app_key"]

    @property
    def access_key(self) -> str:
        """The configured access key."""
        return self.auth["access_key"]
||||
class AsrRequestHeader:
    """Builds the 4-byte binary header that prefixes every request frame.

    All ``with_*`` setters are fluent (they return ``self``) so a header
    can be configured in a single chained expression.
    """

    def __init__(self):
        self.message_type = MessageType.CLIENT_FULL_REQUEST
        self.message_type_specific_flags = MessageTypeSpecificFlags.POS_SEQUENCE
        self.serialization_type = SerializationType.JSON
        self.compression_type = CompressionType.GZIP
        self.reserved_data = bytes([0x00])

    def with_message_type(self, message_type: int) -> 'AsrRequestHeader':
        """Set the message type nibble."""
        self.message_type = message_type
        return self

    def with_message_type_specific_flags(self, flags: int) -> 'AsrRequestHeader':
        """Set the sequence-flag nibble."""
        self.message_type_specific_flags = flags
        return self

    def with_serialization_type(self, serialization_type: int) -> 'AsrRequestHeader':
        """Set the serialization nibble."""
        self.serialization_type = serialization_type
        return self

    def with_compression_type(self, compression_type: int) -> 'AsrRequestHeader':
        """Set the compression nibble."""
        self.compression_type = compression_type
        return self

    def with_reserved_data(self, reserved_data: bytes) -> 'AsrRequestHeader':
        """Replace the trailing reserved byte(s)."""
        self.reserved_data = reserved_data
        return self

    def to_bytes(self) -> bytes:
        """Serialize to the on-wire header bytes."""
        first = (ProtocolVersion.V1 << 4) | 1  # version nibble | header size (units of 4 bytes)
        second = (self.message_type << 4) | self.message_type_specific_flags
        third = (self.serialization_type << 4) | self.compression_type
        return bytes([first, second, third]) + self.reserved_data

    @staticmethod
    def default_header() -> 'AsrRequestHeader':
        """A header with the default field values."""
        return AsrRequestHeader()
||||
class RequestBuilder:
    """Builds the binary request frames sent to the SAUC ASR service."""

    @staticmethod
    def new_auth_headers(config: AsrConfig) -> Dict[str, str]:
        """Create the HTTP authentication headers for the WS handshake."""
        reqid = str(uuid.uuid4())
        return {
            "X-Api-Resource-Id": "volc.bigasr.sauc.duration",
            "X-Api-Request-Id": reqid,
            "X-Api-Access-Key": config.access_key,
            "X-Api-App-Key": config.app_key
        }

    @staticmethod
    def new_full_client_request(seq: int) -> bytes:
        """Create the initial full-client (handshake) request frame.

        Frame layout: 4-byte header | int32 seq (big-endian) |
        uint32 payload size (big-endian) | gzip-compressed JSON payload.
        """
        header = AsrRequestHeader.default_header() \
            .with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)

        payload = {
            "user": {
                "uid": "local_voice_user"
            },
            "audio": {
                "format": "wav",
                "codec": "raw",
                "rate": 16000,
                "bits": 16,
                "channel": 1
            },
            "request": {
                "model_name": "bigmodel",
                "enable_itn": True,
                "enable_punc": True,
                "enable_ddc": True,
                "show_utterances": True,
                "enable_nonstream": False
            }
        }

        payload_bytes = json.dumps(payload).encode('utf-8')
        compressed_payload = AudioUtils.gzip_compress(payload_bytes)
        payload_size = len(compressed_payload)

        request = bytearray()
        request.extend(header.to_bytes())
        request.extend(struct.pack('>i', seq))
        # BUGFIX: '>U' is not a valid struct format character and raised
        # struct.error at runtime. The payload size is an unsigned 32-bit
        # big-endian integer, i.e. '>I' (matches the parser's counterpart).
        request.extend(struct.pack('>I', payload_size))
        request.extend(compressed_payload)

        return bytes(request)

    @staticmethod
    def new_audio_only_request(seq: int, segment: bytes, is_last: bool = False) -> bytes:
        """Create an audio-only frame; the last frame carries a negated seq."""
        header = AsrRequestHeader.default_header()
        if is_last:
            header.with_message_type_specific_flags(MessageTypeSpecificFlags.NEG_WITH_SEQUENCE)
            seq = -seq
        else:
            header.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
        header.with_message_type(MessageType.CLIENT_AUDIO_ONLY_REQUEST)

        request = bytearray()
        request.extend(header.to_bytes())
        request.extend(struct.pack('>i', seq))

        compressed_segment = AudioUtils.gzip_compress(segment)
        # BUGFIX: '>U' -> '>I' (see new_full_client_request).
        request.extend(struct.pack('>I', len(compressed_segment)))
        request.extend(compressed_segment)

        return bytes(request)
||||
class AsrResponse:
    """Mutable container for one parsed server message."""

    def __init__(self):
        self.code = 0                # server error code (0 = OK)
        self.event = 0               # event code, when present
        self.is_last_package = False # True on the final message of a stream
        self.payload_sequence = 0    # sequence number echoed by the server
        self.payload_size = 0        # declared payload byte count
        self.payload_msg = None      # decoded JSON payload, if any

    def to_dict(self) -> Dict[str, Any]:
        """Snapshot of all fields as a plain dict (handy for logging)."""
        field_names = ("code", "event", "is_last_package",
                       "payload_sequence", "payload_size", "payload_msg")
        return {name: getattr(self, name) for name in field_names}
||||
class ResponseParser:
    """Parses binary server frames into :class:`AsrResponse` objects."""

    @staticmethod
    def parse_response(msg: bytes) -> AsrResponse:
        """Parse one binary message.

        Layout: byte 0 holds version/header-size nibbles, byte 1 the
        message type and flags, byte 2 serialization/compression; the
        payload starts at ``header_size * 4``.
        """
        response = AsrResponse()

        header_size = msg[0] & 0x0f
        message_type = msg[1] >> 4
        message_type_specific_flags = msg[1] & 0x0f
        serialization_method = msg[2] >> 4
        message_compression = msg[2] & 0x0f

        payload = msg[header_size*4:]

        # Flag bits: 0x01 = sequence present, 0x02 = last package,
        # 0x04 = event code present.
        if message_type_specific_flags & 0x01:
            response.payload_sequence = struct.unpack('>i', payload[:4])[0]
            payload = payload[4:]
        if message_type_specific_flags & 0x02:
            response.is_last_package = True
        if message_type_specific_flags & 0x04:
            response.event = struct.unpack('>i', payload[:4])[0]
            payload = payload[4:]

        # Message-type-specific prefix fields.
        if message_type == MessageType.SERVER_FULL_RESPONSE:
            # BUGFIX: '>U' is not a valid struct format character and raised
            # struct.error at runtime; the size field is an unsigned 32-bit
            # big-endian integer, i.e. '>I'.
            response.payload_size = struct.unpack('>I', payload[:4])[0]
            payload = payload[4:]
        elif message_type == MessageType.SERVER_ERROR_RESPONSE:
            response.code = struct.unpack('>i', payload[:4])[0]
            # BUGFIX: '>U' -> '>I' (see above).
            response.payload_size = struct.unpack('>I', payload[4:8])[0]
            payload = payload[8:]

        if not payload:
            return response

        # Decompress the payload when the server gzipped it.
        if message_compression == CompressionType.GZIP:
            try:
                payload = AudioUtils.gzip_decompress(payload)
            except Exception as e:
                logger.error(f"Failed to decompress payload: {e}")
                return response

        # Decode the payload body (only JSON is understood).
        try:
            if serialization_method == SerializationType.JSON:
                response.payload_msg = json.loads(payload.decode('utf-8'))
        except Exception as e:
            logger.error(f"Failed to parse payload: {e}")

        return response
||||
class SpeechRecognizer:
|
||||
"""语音识别器"""
|
||||
|
||||
def __init__(self, app_key: str = None, access_key: str = None,
|
||||
url: str = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"):
|
||||
self.config = AsrConfig(app_key, access_key)
|
||||
self.url = url
|
||||
self.seq = 1
|
||||
|
||||
async def recognize_file(self, file_path: str) -> List[RecognitionResult]:
|
||||
"""识别音频文件"""
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"Audio file not found: {file_path}")
|
||||
|
||||
results = []
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# 读取音频文件
|
||||
with open(file_path, 'rb') as f:
|
||||
content = f.read()
|
||||
|
||||
if not AudioUtils.is_wav_file(content):
|
||||
raise ValueError("Audio file must be in WAV format")
|
||||
|
||||
# 获取音频信息
|
||||
try:
|
||||
_, _, sample_rate, _, audio_data = AudioUtils.read_wav_info(content)
|
||||
if sample_rate != DEFAULT_SAMPLE_RATE:
|
||||
logger.warning(f"Sample rate {sample_rate} != {DEFAULT_SAMPLE_RATE}, may affect recognition accuracy")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to read audio info: {e}")
|
||||
raise
|
||||
|
||||
# 计算分段大小 (200ms per segment)
|
||||
segment_size = 1 * 2 * DEFAULT_SAMPLE_RATE * 200 // 1000 # channel * bytes_per_sample * sample_rate * duration_ms / 1000
|
||||
|
||||
# 创建WebSocket连接
|
||||
headers = RequestBuilder.new_auth_headers(self.config)
|
||||
async with session.ws_connect(self.url, headers=headers) as ws:
|
||||
|
||||
# 发送完整客户端请求
|
||||
request = RequestBuilder.new_full_client_request(self.seq)
|
||||
self.seq += 1
|
||||
await ws.send_bytes(request)
|
||||
|
||||
# 接收初始响应
|
||||
msg = await ws.receive()
|
||||
if msg.type == aiohttp.WSMsgType.BINARY:
|
||||
response = ResponseParser.parse_response(msg.data)
|
||||
logger.info(f"Initial response: {response.to_dict()}")
|
||||
|
||||
# 分段发送音频数据
|
||||
audio_segments = self._split_audio(audio_data, segment_size)
|
||||
total_segments = len(audio_segments)
|
||||
|
||||
for i, segment in enumerate(audio_segments):
|
||||
is_last = (i == total_segments - 1)
|
||||
request = RequestBuilder.new_audio_only_request(
|
||||
self.seq,
|
||||
segment,
|
||||
is_last=is_last
|
||||
)
|
||||
await ws.send_bytes(request)
|
||||
logger.info(f"Sent audio segment {i+1}/{total_segments}")
|
||||
|
||||
if not is_last:
|
||||
self.seq += 1
|
||||
|
||||
# 短暂延迟模拟实时流
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# 接收识别结果
|
||||
final_text = ""
|
||||
while True:
|
||||
msg = await ws.receive()
|
||||
if msg.type == aiohttp.WSMsgType.BINARY:
|
||||
response = ResponseParser.parse_response(msg.data)
|
||||
|
||||
if response.payload_msg and 'text' in response.payload_msg:
|
||||
text = response.payload_msg['text']
|
||||
if text:
|
||||
final_text += text
|
||||
|
||||
result = RecognitionResult(
|
||||
text=text,
|
||||
confidence=0.9, # 默认置信度
|
||||
is_final=response.is_last_package
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
logger.info(f"Recognized: {text}")
|
||||
|
||||
if response.is_last_package or response.code != 0:
|
||||
break
|
||||
|
||||
elif msg.type == aiohttp.WSMsgType.ERROR:
|
||||
logger.error(f"WebSocket error: {msg.data}")
|
||||
break
|
||||
elif msg.type == aiohttp.WSMsgType.CLOSED:
|
||||
logger.info("WebSocket connection closed")
|
||||
break
|
||||
|
||||
# 如果没有获得最终结果,创建一个包含所有文本的结果
|
||||
if final_text and not any(r.is_final for r in results):
|
||||
final_result = RecognitionResult(
|
||||
text=final_text,
|
||||
confidence=0.9,
|
||||
is_final=True
|
||||
)
|
||||
results.append(final_result)
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Speech recognition failed: {e}")
|
||||
raise
|
||||
|
||||
def _split_audio(self, data: bytes, segment_size: int) -> List[bytes]:
|
||||
"""分割音频数据"""
|
||||
if segment_size <= 0:
|
||||
return []
|
||||
|
||||
segments = []
|
||||
for i in range(0, len(data), segment_size):
|
||||
end = i + segment_size
|
||||
if end > len(data):
|
||||
end = len(data)
|
||||
segments.append(data[i:end])
|
||||
return segments
|
||||
|
||||
async def recognize_latest_recording(self, directory: str = ".") -> Optional[RecognitionResult]:
    """Recognize the most recent ``recording_*.wav`` file in *directory*.

    Returns the last final recognition result if one exists, otherwise the
    last partial result; returns None when no recording file is found or
    recognition fails.
    """
    candidates = [
        name for name in os.listdir(directory)
        if name.startswith('recording_') and name.endswith('.wav')
    ]
    if not candidates:
        logger.warning("No recording files found")
        return None

    # Filenames embed a sortable timestamp, so the lexicographic maximum
    # is the newest recording.
    newest = max(candidates)
    newest_path = os.path.join(directory, newest)

    logger.info(f"Recognizing latest recording: {newest}")

    try:
        results = await self.recognize_file(newest_path)
        if results:
            finals = [r for r in results if r.is_final]
            # Prefer the last result flagged final; otherwise fall back to
            # the last result of any kind.
            return finals[-1] if finals else results[-1]
    except Exception as e:
        logger.error(f"Failed to recognize latest recording: {e}")

    return None
|
||||
|
||||
async def main():
    """CLI entry point: recognize a given audio file or the latest recording."""
    import argparse

    parser = argparse.ArgumentParser(description="语音识别测试")
    parser.add_argument("--file", type=str, help="音频文件路径")
    parser.add_argument("--latest", action="store_true", help="识别最新的录音文件")
    parser.add_argument("--app-key", type=str, help="SAUC App Key")
    parser.add_argument("--access-key", type=str, help="SAUC Access Key")
    args = parser.parse_args()

    recognizer = SpeechRecognizer(app_key=args.app_key, access_key=args.access_key)

    try:
        if args.latest:
            outcome = await recognizer.recognize_latest_recording()
            if outcome:
                print(f"识别结果: {outcome.text}")
                print(f"置信度: {outcome.confidence}")
                print(f"最终结果: {outcome.is_final}")
            else:
                print("未能识别到语音内容")
        elif args.file:
            # Print every (partial and final) result for the given file.
            for index, outcome in enumerate(await recognizer.recognize_file(args.file), start=1):
                print(f"结果 {index}: {outcome.text}")
                print(f"置信度: {outcome.confidence}")
                print(f"最终结果: {outcome.is_final}")
                print("-" * 40)
        else:
            print("请指定 --file 或 --latest 参数")
    except Exception as e:
        print(f"识别失败: {e}")
|
||||
|
||||
# Script entry point: run the async CLI under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@ -1,344 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
语音录制模块
|
||||
基于pyaudio实现,支持语音活动检测(VAD)自动判断录音结束
|
||||
"""
|
||||
|
||||
import pyaudio
|
||||
import wave
|
||||
import numpy as np
|
||||
import time
|
||||
import os
|
||||
import threading
|
||||
from collections import deque
|
||||
|
||||
class VoiceRecorder:
    """Voice recorder with energy-based voice-activity detection (VAD).

    Reads chunks from the default input device, starts a recording once a
    chunk's RMS energy exceeds ``energy_threshold``, and stops after
    ``silence_threshold`` seconds of silence (subject to the min/max
    recording-time limits). A rolling pre-speech buffer is prepended so the
    start of an utterance is not clipped. Completed recordings are delivered
    to ``on_recording_complete`` as raw 16-bit mono PCM bytes.
    """

    def __init__(self,
                 energy_threshold=500,
                 silence_threshold=1.0,
                 min_recording_time=0.5,
                 max_recording_time=10.0,
                 sample_rate=16000,
                 chunk_size=1024,
                 defer_audio_init=False):
        """Initialize the recorder.

        Args:
            energy_threshold: RMS energy above which a chunk counts as speech.
            silence_threshold: Seconds of continuous silence that end a recording.
            min_recording_time: Minimum recording length (seconds) before
                silence is allowed to end it.
            max_recording_time: Hard cap on recording length (seconds).
            sample_rate: Capture sample rate in Hz.
            chunk_size: Frames read from the stream per loop iteration.
            defer_audio_init: If True, do not open PyAudio in the constructor.
        """
        self.energy_threshold = energy_threshold
        self.silence_threshold = silence_threshold
        self.min_recording_time = min_recording_time
        self.max_recording_time = max_recording_time
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.defer_audio_init = defer_audio_init

        # Fixed capture format: 16-bit signed, mono.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1

        # Runtime state.
        self.audio = None
        self.stream = None
        self.recording = False
        self.recorded_frames = []

        # VAD bookkeeping: silence/recording timestamps plus a rolling
        # pre-speech buffer sized to roughly 2 seconds of audio.
        self.silence_start_time = None
        self.recording_start_time = None
        self.audio_buffer = deque(maxlen=int(sample_rate / chunk_size * 2))

        # User callbacks (set via the setter methods below).
        self.on_recording_complete = None
        self.on_speech_detected = None

        if not defer_audio_init:
            self._setup_audio()

    def _setup_audio(self):
        """Open PyAudio and report the default input device; re-raises on failure."""
        try:
            self.audio = pyaudio.PyAudio()

            # Surface which device will be captured from, for diagnostics.
            device_info = self.audio.get_default_input_device_info()
            print(f"使用音频设备: {device_info['name']}")

        except Exception as e:
            print(f"音频设备初始化失败: {e}")
            raise

    def _calculate_energy(self, audio_data):
        """Return the RMS energy of a chunk of 16-bit PCM bytes (0 for empty input)."""
        if len(audio_data) == 0:
            return 0

        audio_array = np.frombuffer(audio_data, dtype=np.int16)

        # Root-mean-square amplitude as the energy measure.
        rms = np.sqrt(np.mean(audio_array ** 2))
        return rms

    def _is_speech(self, audio_data):
        """Return True when the chunk's energy exceeds the speech threshold."""
        energy = self._calculate_energy(audio_data)
        return energy > self.energy_threshold

    def _open_stream(self):
        """Open the input stream if it is not already open (idempotent)."""
        if self.stream is not None:
            return

        self.stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size
        )

    def _close_stream(self):
        """Stop and close the input stream if open (idempotent)."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None

    def start_listening(self):
        """Begin listening for speech on a daemon thread; no-op if already recording."""
        if self.recording:
            print("正在录音中...")
            return

        self._open_stream()
        self.recording = True
        self.recorded_frames = []
        self.silence_start_time = None
        self.recording_start_time = None

        print("开始监听语音...")

        # Capture runs off the caller's thread; daemon so it won't block exit.
        recording_thread = threading.Thread(target=self._record_loop)
        recording_thread.daemon = True
        recording_thread.start()

    def _record_loop(self):
        """Capture loop: detect speech, accumulate frames, stop on silence/timeout."""
        try:
            while self.recording:
                data = self.stream.read(self.chunk_size, exception_on_overflow=False)

                if len(data) == 0:
                    continue

                energy = self._calculate_energy(data)

                # Always feed the rolling pre-speech buffer.
                self.audio_buffer.append(data)

                if energy > self.energy_threshold:
                    # Speech detected.
                    if self.recording_start_time is None:
                        # Transition idle -> recording; seed with the buffered
                        # audio so the utterance onset is preserved.
                        self.recording_start_time = time.time()
                        self.silence_start_time = None
                        self.recorded_frames = list(self.audio_buffer)

                        print("🎤 检测到语音,开始录音...")

                        if self.on_speech_detected:
                            self.on_speech_detected()

                    # Any speech resets the silence timer.
                    self.silence_start_time = None

                    self.recorded_frames.append(data)

                elif self.recording_start_time is not None:
                    # Previously speaking; now possibly in a silent stretch.
                    if self.silence_start_time is None:
                        self.silence_start_time = time.time()

                    # Keep recording through the silence until timeout.
                    self.recorded_frames.append(data)

                    silence_duration = time.time() - self.silence_start_time
                    if silence_duration > self.silence_threshold:
                        recording_duration = time.time() - self.recording_start_time

                        # Only end on silence once the minimum length is met.
                        if recording_duration >= self.min_recording_time:
                            print(f"静音 {silence_duration:.1f}s,结束录音")
                            self.stop_recording()
                            break
                        else:
                            print(f"录音时间太短 ({recording_duration:.1f}s),继续等待...")
                            self.silence_start_time = time.time()

                # Hard cap: never record past max_recording_time.
                if self.recording_start_time is not None:
                    recording_duration = time.time() - self.recording_start_time
                    if recording_duration > self.max_recording_time:
                        print(f"达到最大录音时间 {self.max_recording_time}s,结束录音")
                        self.stop_recording()
                        break

                # Brief sleep to avoid a tight spin.
                time.sleep(0.01)

        except Exception as e:
            print(f"录音过程中发生错误: {e}")
            self.stop_recording()

    def stop_recording(self):
        """Stop capture, deliver accumulated audio to the callback, reset state."""
        if not self.recording:
            return

        self.recording = False
        self._close_stream()

        if len(self.recorded_frames) > 0:
            audio_data = b''.join(self.recorded_frames)

            print(f"录音完成,共 {len(self.recorded_frames)} 帧")
            # 2 bytes per sample (16-bit mono PCM).
            print(f"录音时长: {len(audio_data) / (self.sample_rate * 2):.2f} 秒")

            if self.on_recording_complete:
                self.on_recording_complete(audio_data)

        # Reset state for the next recording.
        self.recorded_frames = []
        self.silence_start_time = None
        self.recording_start_time = None

    def save_audio(self, audio_data, filename):
        """Write raw PCM bytes to *filename* as a WAV file; return True on success."""
        try:
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
                wf.setframerate(self.sample_rate)
                wf.writeframes(audio_data)

            # Bug fix: this f-string previously contained no placeholder and
            # printed literal text instead of the actual output path.
            print(f"音频已保存到: {filename}")
            return True
        except Exception as e:
            print(f"保存音频失败: {e}")
            return False

    def set_recording_complete_callback(self, callback):
        """Register a callback invoked with the raw PCM bytes of each finished recording."""
        self.on_recording_complete = callback

    def set_speech_detected_callback(self, callback):
        """Register a zero-argument callback invoked when speech is first detected."""
        self.on_speech_detected = callback

    def adjust_sensitivity(self, energy_threshold=None, silence_threshold=None):
        """Adjust the energy and/or silence thresholds at runtime (None = unchanged)."""
        if energy_threshold is not None:
            self.energy_threshold = energy_threshold
            print(f"能量阈值调整为: {energy_threshold}")

        if silence_threshold is not None:
            self.silence_threshold = silence_threshold
            print(f"静音阈值调整为: {silence_threshold}秒")

    def get_audio_level(self):
        """Return the RMS energy of the most recently captured chunk (0 if none)."""
        if len(self.audio_buffer) > 0:
            latest_data = self.audio_buffer[-1]
            return self._calculate_energy(latest_data)
        return 0

    def cleanup(self):
        """Stop any recording and release PyAudio resources."""
        self.stop_recording()
        if self.audio:
            self.audio.terminate()
            self.audio = None
|
||||
|
||||
def main():
    """Interactive test of VoiceRecorder: record on detected speech, save WAV files."""
    print("🎙️ 语音录制测试")
    print("=" * 50)
    print("配置:")
    print("- 能量阈值: 500")
    print("- 静音阈值: 1.0秒")
    print("- 最小录音时间: 0.5秒")
    print("- 最大录音时间: 10秒")
    print("=" * 50)
    print("请说话测试录音功能...")
    print("按 Ctrl+C 退出")

    def on_recording_complete(audio_data):
        """Save a finished recording to a timestamped WAV file and report it."""
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"recording_{timestamp}.wav"

        recorder.save_audio(audio_data, filename)
        # Bug fix: this f-string previously contained no placeholder and
        # printed literal text instead of the generated filename.
        print(f"✅ 录音文件已保存: {filename}")

        # 16-bit mono PCM: 2 bytes per sample.
        duration = len(audio_data) / (recorder.sample_rate * 2)
        print(f"录音时长: {duration:.2f} 秒")

    def on_speech_detected():
        """Report that speech activity was detected."""
        print("🔊 检测到语音活动...")

    # Create the recorder with the configuration advertised above.
    recorder = VoiceRecorder(
        energy_threshold=500,
        silence_threshold=1.0,
        min_recording_time=0.5,
        max_recording_time=10.0
    )

    recorder.set_recording_complete_callback(on_recording_complete)
    recorder.set_speech_detected_callback(on_speech_detected)

    try:
        recorder.start_listening()

        # Keep the main thread alive while the capture thread works.
        while True:
            time.sleep(0.1)

            # Optionally show the live input level (carriage return keeps
            # the display on one line).
            level = recorder.get_audio_level()
            if level > 100:
                print(f"当前音频级别: {level:.0f}", end='\r')

    except KeyboardInterrupt:
        print("\n👋 退出录音测试")
    finally:
        recorder.cleanup()
|
||||
|
||||
# Script entry point: run the interactive recording test.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user