修复语音检测算法:解决背景噪音和能量计算问题

- 修复能量历史更新:只在非录音状态更新背景噪音
- 提高ZCR阈值:1000-4000范围更适合语音检测
- 调整动态阈值:背景噪音+50%(原为+25%;阈值更高,减少背景噪音误触发)
- 添加峰值能量计算和调试信息显示
- 解决语音影响背景噪音计算的问题
This commit is contained in:
朱潮 2025-09-20 12:08:40 +08:00
parent c01e6ad1f6
commit 48b99384b7

View File

@ -81,13 +81,24 @@ class EnergyBasedRecorder:
# 计算RMS能量
rms = np.sqrt(np.mean(audio_array ** 2))
# 更新能量历史
# 更新能量历史(只在非录音状态下更新,避免语音影响背景噪音计算)
if not self.recording:
self.energy_history.append(rms)
if len(self.energy_history) > self.max_energy_history:
self.energy_history.pop(0)
return rms
def calculate_peak_energy(self, audio_data):
    """Return the peak absolute sample amplitude of a raw audio chunk.

    Auxiliary measure reported next to the RMS energy.

    Args:
        audio_data: Bytes-like buffer that is interpreted as signed 16-bit
            samples in native byte order.

    Returns:
        The largest absolute sample value as a plain ``int`` (0 to 32768),
        or 0 when the buffer is empty.
    """
    if len(audio_data) == 0:
        return 0

    samples = np.frombuffer(audio_data, dtype=np.int16)
    # Widen before taking the absolute value: in int16, abs(-32768) wraps
    # back to -32768, so a full-scale negative sample would otherwise be
    # reported as a negative (or too small) peak.
    return int(np.abs(samples.astype(np.int32)).max())
def calculate_zero_crossing_rate(self, audio_data):
"""计算零交叉率(辅助判断语音)"""
if len(audio_data) == 0:
@ -110,21 +121,21 @@ class EnergyBasedRecorder:
# 使用最近10个样本的中位数作为背景噪音
background_energy = np.median(self.energy_history[-10:])
# 动态阈值:背景噪音 + 25%比原来的500更敏感
dynamic_threshold = max(50, background_energy * 1.25)
# Dynamic threshold: background noise + 50% (raised from +25%; the larger margin means fewer noise-triggered detections)
dynamic_threshold = max(50, background_energy * 1.5)
# 能量条件
energy_condition = energy > dynamic_threshold
# 零交叉率条件:语音通常在500-5000 Hz之间
# 对于8kHz采样率ZCR通常在500-2000之间
zcr_condition = 500 < zcr < 3000
# 零交叉率条件:语音通常在1000-5000 Hz之间
# 对于8kHz采样率ZCR通常在1000-4000之间
zcr_condition = 1000 < zcr < 4000
# 同时满足能量和ZCR条件才认为是语音
return energy_condition and zcr_condition
else:
# 初始阶段使用固定阈值
return energy > 80 # 更低的初始阈值
return energy > 60 and zcr > 1000 # 更严格的初始条件
def get_average_energy(self):
"""获取平均能量水平"""
@ -332,6 +343,7 @@ class EnergyBasedRecorder:
# 计算能量和零交叉率
energy = self.calculate_energy(data)
zcr = self.calculate_zero_crossing_rate(data)
peak_energy = self.calculate_peak_energy(data)
# 性能监控
self.monitor_performance()
@ -359,9 +371,10 @@ class EnergyBasedRecorder:
print(f"\n⏰ 达到最大录音时间 {self.max_recording_time}")
self.stop_recording()
# 显示录音状态(包含预录音信息)
# 显示录音状态(包含调试信息)
bg_energy = np.median(self.energy_history[-10:]) if len(self.energy_history) >= 10 else 0
status = f"录音中... {recording_duration:.1f}s | 能量: {energy:.0f} | ZCR: {zcr:.0f} | 背景: {bg_energy:.0f}"
is_voice = self.is_voice_active_advanced(energy, zcr)
status = f"录音中... {recording_duration:.1f}s | RMS: {energy:.0f} | 峰值: {peak_energy:.0f} | ZCR: {zcr:.0f} | 语音: {is_voice}"
print(f"\r{status}", end='', flush=True)
else:
@ -373,11 +386,12 @@ class EnergyBasedRecorder:
# 检测到声音,开始录音
self.start_recording()
else:
# 显示监听状态(包含缓冲区信息)
# 显示监听状态(包含调试信息)
avg_energy = self.get_average_energy()
bg_energy = np.median(self.energy_history[-10:]) if len(self.energy_history) >= 10 else 0
buffer_usage = len(self.pre_record_buffer) / self.pre_record_max_frames * 100
status = f"监听中... 能量: {energy:.0f} | ZCR: {zcr:.0f} | 背景: {bg_energy:.0f} | 缓冲: {buffer_usage:.0f}%"
is_voice = self.is_voice_active_advanced(energy, zcr)
status = f"监听中... RMS: {energy:.0f} | 峰值: {peak_energy:.0f} | ZCR: {zcr:.0f} | 背景: {bg_energy:.0f} | 语音: {is_voice} | 缓冲: {buffer_usage:.0f}%"
print(f"\r{status}", end='', flush=True)
# 减少CPU使用