回声待处理

This commit is contained in:
朱潮 2025-09-21 03:00:11 +08:00
parent aed69e9c54
commit 9523176d60
9 changed files with 2087 additions and 499 deletions

View File

@ -0,0 +1,127 @@
# Audio Processes 改进总结
## 问题背景
- 原始问题:TTS 音频只播放 3 个字符就停止,随后出现 ALSA underrun 错误
- 根本原因:音频缓冲区管理不当,播放策略过于保守
## 改进内容
### 1. 音频播放优化 (_play_audio 方法)
- **改进前**:保守的播放策略,需要缓冲区有足够数据才开始播放
- **改进后**:
- 借鉴 recorder.py 的播放策略:只要有数据就播放
- 添加错误恢复机制,自动检测和恢复 ALSA underrun
- 优化缓冲区管理,减少延迟
### 2. TTS 工作线程模式
- **参考**: recorder.py 的 TTS 工作线程实现
- **实现功能**
- 独立的 TTS 工作线程处理音频生成
- 任务队列管理,避免阻塞主线程
- 统一的 TTS 请求接口 `process_tts_request()`
- 支持流式音频处理
### 3. 统一的音频播放队列
- **InputProcess 和 OutputProcess 都支持**
- TTS 工作线程
- 音频生成和播放队列
- 统一的错误处理和日志记录
### 4. 关键改进点
#### 音频播放策略
```python
# 改进前:保守策略
if len(self.playback_buffer) > 2: # 需要缓冲区有足够数据
# 开始播放
# 改进后:积极策略 + 错误恢复
audio_chunk = self.playback_buffer.pop(0)
if audio_chunk and len(audio_chunk) > 0:
try:
self.output_stream.write(audio_chunk)
# 统计信息
except Exception as e:
# ALSA underrun 错误恢复
if "underrun" in str(e).lower():
# 自动恢复音频流
```
#### TTS 工作线程
```python
def _tts_worker(self):
"""TTS工作线程 - 处理TTS任务队列"""
while self.tts_worker_running:
try:
task = self.tts_task_queue.get(timeout=1.0)
if task is None:
break
task_type, content = task
if task_type == "tts_sentence":
self._generate_tts_audio(content)
self.tts_task_queue.task_done()
except queue.Empty:
continue
except Exception as e:
self.logger.error(f"TTS工作线程错误: {e}")
```
#### 错误恢复机制
```python
# ALSA underrun 检测和恢复
if "underrun" in str(e).lower() or "alsa" in str(e).lower():
self.logger.info("检测到ALSA underrun尝试恢复音频流")
try:
if self.output_stream:
self.output_stream.stop_stream()
time.sleep(0.1)
self.output_stream.start_stream()
self.logger.info("音频流已恢复")
except Exception as recovery_e:
self.logger.error(f"恢复音频流失败: {recovery_e}")
self.playback_buffer.clear()
```
### 5. 性能优化
- 减少日志输出频率,提高性能
- 优化队列处理策略,使用适当的超时设置
- 动态调整休眠时间根据播放状态优化CPU使用
### 6. 测试和验证
- 创建了测试脚本 `test_audio_processes.py`
- 验证了语法正确性
- 可以测试 TTS 功能的完整性
## 使用方法
### 在控制系统中使用
```python
from audio_processes import InputProcess, OutputProcess
# 创建输入和输出进程
input_process = InputProcess(command_queue, event_queue)
output_process = OutputProcess(audio_queue)
# 处理TTS请求
output_process.process_tts_request("你好,这是测试语音")
```
### 独立测试
```bash
python test_audio_processes.py
```
## 预期效果
- 解决 ALSA underrun 错误
- 提高音频播放的流畅性
- 减少 TTS 处理的延迟
- 提供更稳定的音频处理能力
## 注意事项
1. 确保系统安装了必要的依赖:`requests`, `pyaudio`
2. 检查音频设备是否正常工作
3. 网络连接正常用于TTS服务
4. 适当调整音频参数以适应不同环境

287
asr_diagnostic.py Normal file
View File

@ -0,0 +1,287 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音识别诊断工具
用于测试和诊断语音识别功能的具体问题
"""
import asyncio
import json
import gzip
import uuid
import numpy as np
import wave
import os
from typing import Optional
class ASRDiagnostic:
    """Diagnostic tool for the streaming speech-recognition (ASR) service.

    Exercises the ByteDance openspeech WebSocket ASR endpoint with three
    kinds of input — silence, random noise, and local WAV recordings — so
    that connectivity, authentication, and response parsing can each be
    checked in isolation.
    """

    def __init__(self):
        # Endpoint and credentials for the ASR service.
        # NOTE(review): credentials are hard-coded; move them to config or
        # environment variables before sharing this file. The cluster name
        # "volcano_tts" looks like a TTS cluster — confirm it is the right
        # one for ASR.
        self.api_config = {
            'asr': {
                'appid': "8718217928",
                'cluster': "volcano_tts",
                'token': "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
                'ws_url': "wss://openspeech.bytedance.com/api/v2/asr"
            }
        }

    def generate_asr_header(self, message_type=1, message_type_specific_flags=0):
        """Build the 4-byte binary header that precedes every ASR message.

        Nibble layout:
        [version | header size][message type | flags][serialization | compression][reserved]

        Args:
            message_type: protocol message type (1 = full client request,
                0b0010 = audio-only request).
            message_type_specific_flags: per-type flags (0b0010 marks the
                final audio chunk).

        Returns:
            bytearray: the assembled 4-byte header.
        """
        PROTOCOL_VERSION = 0b0001
        DEFAULT_HEADER_SIZE = 0b0001
        JSON = 0b0001  # serialization method nibble (JSON)
        GZIP = 0b0001  # compression method nibble (gzip)
        header = bytearray()
        header.append((PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE)
        header.append((message_type << 4) | message_type_specific_flags)
        header.append((JSON << 4) | GZIP)
        header.append(0x00)  # reserved
        return header

    def parse_asr_response(self, res):
        """Parse a raw binary ASR server frame into a dict (best effort).

        Simplified parser: assumes the payload starts at byte 8 and is plain
        (uncompressed) JSON. NOTE(review): the request header advertises GZIP
        compression, but this parser never gzip-decompresses the payload — a
        gzipped response will fail the ``startswith(b'{')`` check and yield
        an empty dict. Confirm against the protocol documentation.

        Args:
            res: raw bytes of one server frame.

        Returns:
            dict: decoded JSON payload, or {} when parsing fails.
        """
        print(f"🔍 解析响应,原始大小: {len(res)} 字节")
        if len(res) < 8:
            print(f"❌ 响应太短,无法解析")
            return {}
        try:
            message_type = res[1] >> 4
            payload_size = int.from_bytes(res[4:8], "big", signed=False)
            payload_msg = res[8:8+payload_size]
            print(f"📋 消息类型: {message_type}, 载荷大小: {payload_size}")
            if message_type == 0b1001:  # SERVER_FULL_RESPONSE
                try:
                    if payload_msg.startswith(b'{'):
                        result = json.loads(payload_msg.decode('utf-8'))
                        print(f"✅ 成功解析JSON响应")
                        return result
                    else:
                        print(f"❌ 响应不是JSON格式")
                except Exception as e:
                    print(f"❌ JSON解析失败: {e}")
        except Exception as e:
            print(f"❌ 响应解析异常: {e}")
        return {}

    async def test_asr_with_audio_file(self, audio_file_path: str):
        """Run ASR on a local WAV file (down-mixing stereo to mono first).

        Args:
            audio_file_path: path to a WAV file on disk.

        Returns:
            Recognized text, or None on failure / missing file.
        """
        print(f"🎵 测试ASR - 音频文件: {audio_file_path}")
        if not os.path.exists(audio_file_path):
            print(f"❌ 音频文件不存在: {audio_file_path}")
            return
        try:
            # Read the WAV container and extract the raw PCM frames.
            with wave.open(audio_file_path, 'rb') as wf:
                channels = wf.getnchannels()
                width = wf.getsampwidth()
                rate = wf.getframerate()
                frames = wf.readframes(wf.getnframes())
            print(f"📊 音频信息: 采样率={rate}Hz, 声道={channels}, 位深={width*8}bits")
            print(f"📊 音频大小: {len(frames)} 字节")
            # Down-mix interleaved multi-channel audio to mono by averaging.
            if channels > 1:
                audio_array = np.frombuffer(frames, dtype=np.int16)
                audio_array = audio_array.reshape(-1, channels)
                audio_array = np.mean(audio_array, axis=1).astype(np.int16)
                frames = audio_array.tobytes()
                print(f"🔄 已转换为单声道")
            return await self._test_asr_connection(frames)
        except Exception as e:
            print(f"❌ 音频文件处理失败: {e}")
            return None

    async def test_asr_with_silence(self):
        """Feed 3 seconds of digital silence (all-zero PCM) to the service."""
        print(f"🔇 测试ASR - 静音音频")
        # 3 s of silence at 16 kHz / 16-bit / mono: 2 bytes per sample, zeros.
        duration = 3  # seconds
        sample_rate = 16000
        silence_data = bytes(duration * sample_rate * 2)  # 2 bytes per sample
        return await self._test_asr_connection(silence_data)

    async def test_asr_with_noise(self):
        """Feed 3 seconds of full-scale uniform random noise to the service."""
        print(f"📢 测试ASR - 噪音音频")
        # 3 s of uniform random int16 noise at 16 kHz.
        duration = 3  # seconds
        sample_rate = 16000
        noise_data = np.random.randint(-32768, 32767, duration * sample_rate, dtype=np.int16)
        noise_data = noise_data.tobytes()
        return await self._test_asr_connection(noise_data)

    async def _test_asr_connection(self, audio_data: bytes):
        """Open a WebSocket to the ASR service and stream *audio_data*.

        Sends the full client request first, then the audio in gzip-compressed
        chunks, and finally tries to extract recognized text from the last
        server response.

        Args:
            audio_data: raw PCM bytes (assumed 16 kHz / 16-bit / mono —
                matches the 'audio' request parameters below).

        Returns:
            Recognized text, or None on failure.
        """
        try:
            import websockets  # imported lazily: only this method needs it

            # Build the JSON request payload.
            reqid = str(uuid.uuid4())
            request_params = {
                'app': {
                    'appid': self.api_config['asr']['appid'],
                    'cluster': self.api_config['asr']['cluster'],
                    'token': self.api_config['asr']['token'],
                },
                'user': {
                    'uid': 'asr_diagnostic'
                },
                'request': {
                    'reqid': reqid,
                    'nbest': 1,
                    'workflow': 'audio_in,resample,partition,vad,fe,decode,itn,nlu_punctuate',
                    'show_language': False,
                    'show_utterances': False,
                    'result_type': 'full',
                    "sequence": 1
                },
                'audio': {
                    # NOTE(review): format is declared 'wav' but callers pass
                    # raw PCM — confirm whether 'pcm' is the correct value.
                    'format': 'wav',
                    'rate': 16000,
                    'language': 'zh-CN',
                    'bits': 16,
                    'channel': 1,
                    'codec': 'raw'
                }
            }
            print(f"📋 ASR请求参数:")
            print(f" - AppID: {request_params['app']['appid']}")
            print(f" - Cluster: {request_params['app']['cluster']}")
            print(f" - Token: {request_params['app']['token'][:20]}...")
            print(f" - RequestID: {reqid}")

            # Serialize + gzip the JSON payload, then prepend header + length.
            payload_bytes = str.encode(json.dumps(request_params))
            payload_bytes = gzip.compress(payload_bytes)
            full_client_request = bytearray(self.generate_asr_header())
            full_client_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
            full_client_request.extend(payload_bytes)

            # NOTE(review): 'Bearer; <token>' (with a semicolon) is unusual,
            # but it matches what the rest of this project sends — confirm
            # against the service docs.
            additional_headers = {'Authorization': 'Bearer; {}'.format(self.api_config['asr']['token'])}

            print(f"📡 连接WebSocket...")
            async with websockets.connect(
                self.api_config['asr']['ws_url'],
                additional_headers=additional_headers,
                max_size=1000000000
            ) as ws:
                print(f"✅ WebSocket连接成功")
                # Send the configuration request and read the first response.
                print(f"📤 发送ASR配置请求...")
                await ws.send(full_client_request)
                res = await ws.recv()
                result = self.parse_asr_response(res)
                print(f"📥 配置响应: {result}")

                # Stream the audio. chunk_size is the byte count of a
                # 15-second mono / 16-bit / 16 kHz chunk
                # (1 ch * 2 B/sample * 16000 Hz * 15000 ms / 1000).
                chunk_size = int(1 * 2 * 16000 * 15000 / 1000)
                total_chunks = 0
                for offset in range(0, len(audio_data), chunk_size):
                    chunk = audio_data[offset:offset + chunk_size]
                    last = (offset + chunk_size) >= len(audio_data)
                    payload_bytes = gzip.compress(chunk)
                    audio_only_request = bytearray(
                        self.generate_asr_header(
                            message_type=0b0010,  # audio-only request
                            message_type_specific_flags=0b0010 if last else 0  # flag final chunk
                        )
                    )
                    audio_only_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
                    audio_only_request.extend(payload_bytes)
                    await ws.send(audio_only_request)
                    res = await ws.recv()
                    result = self.parse_asr_response(res)
                    total_chunks += 1
                    if last:
                        print(f"📨 发送最后一块音频数据 (总计{total_chunks}块)")

                # Only the response to the LAST chunk is inspected here.
                # NOTE(review): parse_asr_response returns the decoded JSON
                # itself and never sets a 'payload_msg' key, so this branch
                # can never find results — it appears copied from a different
                # parser. Verify which shape the parser actually returns.
                print(f"🎯 等待最终识别结果...")
                if 'payload_msg' in result and 'result' in result['payload_msg']:
                    results = result['payload_msg']['result']
                    print(f"📝 ASR返回结果数量: {len(results)}")
                    if results:
                        text = results[0].get('text', '识别失败')
                        print(f"✅ 识别结果: {text}")
                        return text
                    else:
                        print(f"❌ ASR结果为空")
                else:
                    print(f"❌ ASR响应格式异常: {result.keys()}")
                    print(f"📋 完整响应: {result}")
                return None
        except Exception as e:
            print(f"❌ ASR连接异常: {e}")
            import traceback
            print(f"❌ 详细错误:\n{traceback.format_exc()}")
            return None

    async def run_diagnostic(self):
        """Run the whole suite: silence, noise, then any local recordings."""
        print("🔧 ASR诊断工具")
        print("=" * 50)
        # 1. Silence should produce an empty / no-speech result.
        print("\n1⃣ 测试静音识别...")
        await self.test_asr_with_silence()
        # 2. Noise should also produce no meaningful transcript.
        print("\n2⃣ 测试噪音识别...")
        await self.test_asr_with_noise()
        # 3. Any recording_*.wav files in the working directory (max 3).
        recording_files = [f for f in os.listdir('.') if f.startswith('recording_') and f.endswith('.wav')]
        if recording_files:
            print(f"\n3⃣ 测试录音文件...")
            for file in recording_files[:3]:  # test at most 3 files
                await self.test_asr_with_audio_file(file)
        else:
            print(f"\n3⃣ 跳过录音文件测试 (无录音文件)")
        print(f"\n✅ 诊断完成")
def main():
    """Entry point: run the full ASR diagnostic suite and report interrupts."""
    tool = ASRDiagnostic()
    try:
        asyncio.run(tool.run_diagnostic())
    except KeyboardInterrupt:
        print(f"\n🛑 诊断被用户中断")
    except Exception as e:
        print(f"❌ 诊断工具异常: {e}")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@ -38,6 +38,7 @@ class ControlSystem:
self.input_command_queue = mp.Queue(maxsize=100) # 主进程 → 输入进程
self.input_event_queue = mp.Queue(maxsize=100) # 输入进程 → 主进程
self.output_audio_queue = mp.Queue(maxsize=1000) # 主进程 → 输出进程
self.output_event_queue = mp.Queue(maxsize=100) # 输出进程 → 主进程
# 进程
self.input_process = None
@ -214,7 +215,8 @@ class ControlSystem:
self.output_process = mp.Process(
target=OutputProcess(
self.output_audio_queue,
output_config
output_config,
self.output_event_queue # 传递事件队列
).run
)
@ -286,13 +288,8 @@ class ControlSystem:
def _handle_playing_state(self):
"""处理播放状态"""
# 检查播放是否完成
if self.output_audio_queue.qsize() == 0 and not self.playback_complete:
# 等待一小段时间确保播放完成
time.sleep(0.5)
if self.output_audio_queue.qsize() == 0:
self.playback_complete = True
self.stats['total_conversations'] += 1
# 现在主要由输出进程的播放完成事件驱动
pass
def _check_events(self):
"""检查进程事件"""
@ -307,6 +304,18 @@ class ControlSystem:
except queue.Empty:
pass
# 检查输出进程事件
try:
while True:
event = self.output_event_queue.get_nowait()
if event.event_type == 'playback_complete':
print("📡 主控制:收到播放完成事件")
self._handle_playback_complete(event)
except queue.Empty:
pass
def _handle_recording_complete(self, event: ProcessEvent):
"""处理录音完成事件"""
@ -327,6 +336,21 @@ class ControlSystem:
print(f"🎯 状态RECORDING → PROCESSING (时长: {event.metadata['duration']:.2f}s)")
def _handle_playback_complete(self, event: ProcessEvent):
"""处理播放完成事件"""
# 标记播放完成
self.playback_complete = True
# 更新统计
self.stats['total_conversations'] += 1
# 切换到空闲状态
self.state = RecordingState.IDLE
print(f"🎯 状态PLAYING → IDLE")
# 重新启用输入进程录音功能
self.input_command_queue.put(ControlCommand('enable_recording'))
def _process_audio_pipeline(self):
"""处理音频流水线STT + LLM + TTS"""
try:
@ -390,9 +414,17 @@ class ControlSystem:
def _speech_to_text(self, audio_data: bytes) -> Optional[str]:
"""语音转文字"""
try:
return asyncio.run(self._recognize_audio_async(audio_data))
print(f"🔍 开始语音识别,音频大小: {len(audio_data)} 字节")
result = asyncio.run(self._recognize_audio_async(audio_data))
if result:
print(f"✅ 语音识别成功: {result}")
else:
print(f"❌ 语音识别返回空结果")
return result
except Exception as e:
print(f"❌ 语音识别异常: {e}")
import traceback
print(f"❌ 详细错误信息:\n{traceback.format_exc()}")
return None
async def _recognize_audio_async(self, audio_data: bytes) -> Optional[str]:
@ -401,7 +433,57 @@ class ControlSystem:
return "语音识别功能已禁用"
try:
# 验证音频数据
print(f"🎵 音频数据验证:")
print(f" - 大小: {len(audio_data)} 字节")
print(f" - 是否为空: {len(audio_data) == 0}")
if len(audio_data) == 0:
print("❌ 音频数据为空")
return None
# 检查是否有WAV头部
has_wav_header = audio_data.startswith(b'RIFF')
print(f" - 有WAV头部: {has_wav_header}")
if has_wav_header:
# 解析WAV头部
print(f" - WAV格式可能需要提取PCM数据")
riff_size = int.from_bytes(audio_data[4:8], 'little')
wave_fmt = audio_data[8:12]
if wave_fmt == b'WAVE':
print(f" - WAVE格式正确")
# 查找fmt块
fmt_pos = audio_data.find(b'fmt ')
if fmt_pos > 0:
fmt_size = int.from_bytes(audio_data[fmt_pos+4:fmt_pos+8], 'little')
audio_format = int.from_bytes(audio_data[fmt_pos+8:fmt_pos+10], 'little')
channels = int.from_bytes(audio_data[fmt_pos+10:fmt_pos+12], 'little')
sample_rate = int.from_bytes(audio_data[fmt_pos+12:fmt_pos+16], 'little')
print(f" - 音频格式: {audio_format}")
print(f" - 声道数: {channels}")
print(f" - 采样率: {sample_rate}")
else:
print(f" - 纯PCM数据")
# 检查音频数据格式假设是16位PCM
if len(audio_data) % 2 != 0:
print(f"⚠️ 音频数据长度不是2的倍数: {len(audio_data)}")
# 计算音频时长
sample_rate = self.config['audio']['sample_rate']
channels = self.config['audio']['channels']
bytes_per_second = sample_rate * channels * 2 # 16位 = 2字节
duration = len(audio_data) / bytes_per_second
print(f" - 配置采样率: {sample_rate} Hz")
print(f" - 配置声道数: {channels}")
print(f" - 估算时长: {duration:.2f}")
if duration < 0.5:
print(f"⚠️ 音频时长过短: {duration:.2f}")
import websockets
print(f"🔗 连接WebSocket ASR服务: {self.api_config['asr']['ws_url']}")
# 生成ASR头部
def generate_asr_header(message_type=1, message_type_specific_flags=0):
@ -417,25 +499,56 @@ class ControlSystem:
header.append(0x00) # reserved
return header
# 解析ASR响应
# 解析ASR响应 - 基于recorder.py的工作实现
def parse_asr_response(res):
# 简化的响应解析
if len(res) < 8:
return {}
"""解析ASR响应"""
PROTOCOL_VERSION = res[0] >> 4
header_size = res[0] & 0x0f
message_type = res[1] >> 4
payload_size = int.from_bytes(res[4:8], "big", signed=False)
payload_msg = res[8:8+payload_size]
message_type_specific_flags = res[1] & 0x0f
serialization_method = res[2] >> 4
message_compression = res[2] & 0x0f
reserved = res[3]
header_extensions = res[4:header_size * 4]
payload = res[header_size * 4:]
result = {}
payload_msg = None
payload_size = 0
print(f"🔍 响应头信息: message_type={message_type}, compression={message_compression}, serialization={serialization_method}")
if message_type == 0b1001: # SERVER_FULL_RESPONSE
try:
if payload_msg.startswith(b'{'):
result = json.loads(payload_msg.decode('utf-8'))
return result
except:
pass
payload_size = int.from_bytes(payload[:4], "big", signed=True)
payload_msg = payload[4:]
print(f"📦 Full响应: payload_size={payload_size}")
elif message_type == 0b1011: # SERVER_ACK
seq = int.from_bytes(payload[:4], "big", signed=True)
result['seq'] = seq
if len(payload) >= 8:
payload_size = int.from_bytes(payload[4:8], "big", signed=False)
payload_msg = payload[8:]
print(f"📦 ACK响应: seq={seq}, payload_size={payload_size}")
elif message_type == 0b1111: # SERVER_ERROR_RESPONSE
code = int.from_bytes(payload[:4], "big", signed=False)
result['code'] = code
payload_size = int.from_bytes(payload[4:8], "big", signed=False)
payload_msg = payload[8:]
print(f"❌ 错误响应: code={code}")
return {}
if payload_msg is None:
return result
if message_compression == 0b0001: # GZIP
payload_msg = gzip.decompress(payload_msg)
print(f"📦 解压后大小: {len(payload_msg)} 字节")
if serialization_method == 0b0001: # JSON
payload_msg = json.loads(str(payload_msg, "utf-8"))
print(f"📋 解析后的JSON: {json.dumps(payload_msg, indent=2, ensure_ascii=False)}")
result['payload_msg'] = payload_msg
result['payload_size'] = payload_size
return result
# 构建请求参数
reqid = str(uuid.uuid4())
@ -458,7 +571,7 @@ class ControlSystem:
"sequence": 1
},
'audio': {
'format': 'wav',
'format': 'pcm',
'rate': self.config['audio']['sample_rate'],
'language': 'zh-CN',
'bits': 16,
@ -468,6 +581,14 @@ class ControlSystem:
}
# 构建请求
print(f"📋 ASR请求参数:")
print(f" - audio.format: {request_params['audio']['format']}")
print(f" - audio.rate: {request_params['audio']['rate']}")
print(f" - audio.channel: {request_params['audio']['channel']}")
print(f" - audio.bits: {request_params['audio']['bits']}")
print(f" - audio.codec: {request_params['audio']['codec']}")
print(f" - request.workflow: {request_params['request']['workflow']}")
payload_bytes = str.encode(json.dumps(request_params))
payload_bytes = gzip.compress(payload_bytes)
full_client_request = bytearray(generate_asr_header())
@ -478,43 +599,136 @@ class ControlSystem:
additional_headers = {'Authorization': 'Bearer; {}'.format(self.api_config['asr']['token'])}
# 连接WebSocket
print(f"📡 尝试连接WebSocket...")
print(f"🔗 WebSocket URL: {self.api_config['asr']['ws_url']}")
print(f"📋 Headers: {additional_headers}")
async with websockets.connect(
self.api_config['asr']['ws_url'],
additional_headers=additional_headers,
max_size=1000000000
max_size=1000000000,
ping_interval=20,
ping_timeout=60
) as ws:
print(f"✅ WebSocket连接成功")
# 发送请求
print(f"📤 发送ASR请求...")
print(f"📦 请求大小: {len(full_client_request)} 字节")
await ws.send(full_client_request)
res = await ws.recv()
print(f"📥 收到ASR响应大小: {len(res)} 字节")
result = parse_asr_response(res)
print(f"🔍 解析ASR响应: {result}")
# 发送音频数据
# 发送音频数据 - 基于recorder.py实现
chunk_size = int(self.config['audio']['channels'] * 2 *
self.config['audio']['sample_rate'] * 15000 / 1000)
print(f"🎵 开始发送音频数据:")
print(f" - 总大小: {len(audio_data)} 字节")
print(f" - 分块大小: {chunk_size} 字节")
print(f" - 预计分块数: {(len(audio_data) + chunk_size - 1) // chunk_size}")
total_chunks = (len(audio_data) + chunk_size - 1) // chunk_size
chunks_sent = 0
for offset in range(0, len(audio_data), chunk_size):
chunks_sent += 1
chunk = audio_data[offset:offset + chunk_size]
last = (offset + chunk_size) >= len(audio_data)
payload_bytes = gzip.compress(chunk)
audio_only_request = bytearray(
generate_asr_header(
message_type=0b0010,
message_type_specific_flags=0b0010 if last else 0
)
)
audio_only_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
audio_only_request.extend(payload_bytes)
print(f"📦 发送第 {chunks_sent}/{total_chunks} 块:")
print(f" - 当前块大小: {len(chunk)} 字节")
print(f" - 偏移量: {offset}-{offset + len(chunk)}")
print(f" - 是否最后一块: {last}")
await ws.send(audio_only_request)
res = await ws.recv()
result = parse_asr_response(res)
try:
payload_bytes = gzip.compress(chunk)
audio_only_request = bytearray(
generate_asr_header(
message_type=0b0010,
message_type_specific_flags=0b0010 if last else 0
)
)
audio_only_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
audio_only_request.extend(payload_bytes)
print(f" - 压缩后大小: {len(payload_bytes)} 字节")
print(f" - 总请求数据大小: {len(audio_only_request)} 字节")
await ws.send(audio_only_request)
print(f" ✅ 第 {chunks_sent} 块发送成功")
# 等待服务器响应
try:
res = await asyncio.wait_for(ws.recv(), timeout=30.0)
print(f" 📥 收到第 {chunks_sent} 块响应,大小: {len(res)} 字节")
result = parse_asr_response(res)
print(f" 🔍 第 {chunks_sent} 块响应解析: {result}")
except asyncio.TimeoutError:
print(f" ⏰ 第 {chunks_sent} 块响应超时")
raise Exception("音频块响应超时")
# 检查每个响应是否有错误
if 'code' in result:
print(f" 🔍 第 {chunks_sent} 块响应码: {result['code']}")
if result['code'] != 1000:
print(f" ❌ 第 {chunks_sent} 块数据发送失败: {result}")
return None
if 'payload_msg' in result and result['payload_msg'].get('code') != 1000:
print(f" ❌ 第 {chunks_sent} 块数据发送失败: {result['payload_msg']}")
return None
except Exception as chunk_error:
print(f" ❌ 第 {chunks_sent} 块发送异常: {chunk_error}")
raise chunk_error
if last:
print(f"📨 发送最后一块音频数据完成")
print(f"🎯 所有音频数据发送完成,共发送 {chunks_sent}")
# 获取最终结果
if 'payload_msg' in result and 'result' in result['payload_msg']:
results = result['payload_msg']['result']
if results:
return results[0].get('text', '识别失败')
# 检查最后一个响应中是否包含识别结果
print(f"🎯 检查最终识别结果...")
print(f"📋 最后一个响应: {result}")
if 'payload_msg' in result:
payload_msg = result['payload_msg']
print(f"📋 最终Payload结构: {list(payload_msg.keys()) if isinstance(payload_msg, dict) else type(payload_msg)}")
print(f"📋 最终Payload内容: {payload_msg}")
if isinstance(payload_msg, dict):
# 检查响应码
if 'code' in payload_msg:
code = payload_msg['code']
print(f"🔢 最终响应码: {code}")
if code == 1000:
print(f"✅ ASR识别成功")
else:
print(f"❌ ASR服务返回错误: {payload_msg.get('message', '未知错误')}")
return None
# 查找结果 - 与recorder.py保持一致
if 'result' in payload_msg:
results = payload_msg['result']
print(f"📝 找到结果字段 'result': {results}")
if isinstance(results, list) and results:
text = results[0].get('text', '识别失败')
print(f"✅ 提取识别文本: {text}")
return text
elif isinstance(results, str):
print(f"✅ 提取识别文本: {results}")
return results
else:
print(f"❌ 未找到result字段可用字段: {list(payload_msg.keys())}")
print(f"完整payload: {json.dumps(payload_msg, indent=2, ensure_ascii=False)}")
else:
print(f"❌ Payload不是字典类型: {type(payload_msg)}")
else:
print(f"❌ 响应中没有payload_msg字段")
print(f"可用字段: {list(result.keys())}")
if 'code' in result:
print(f"错误码: {result['code']}")
return None
@ -580,9 +794,12 @@ class ControlSystem:
try:
print("🎵 开始文本转语音")
print(f"📝 待转换文本: {text}")
# 发送元数据
self.output_audio_queue.put(f"METADATA:{text[:30]}...")
metadata_msg = f"METADATA:{text[:30]}..."
print(f"📦 发送元数据: {metadata_msg}")
self.output_audio_queue.put(metadata_msg)
# 构建请求头
headers = {
@ -614,6 +831,7 @@ class ControlSystem:
# 发送请求
session = requests.Session()
try:
print(f"🌐 发送TTS请求到: {self.api_config['tts']['url']}")
response = session.post(
self.api_config['tts']['url'],
headers=headers,
@ -625,16 +843,22 @@ class ControlSystem:
print(f"❌ TTS请求失败: {response.status_code}")
return False
print(f"✅ TTS请求成功开始接收音频流")
# 处理流式响应
total_audio_size = 0
chunk_count = 0
queue_size_before = self.output_audio_queue.qsize()
for chunk in response.iter_lines(decode_unicode=True):
if not chunk:
continue
print(f"🔍 原始TTS响应块 {chunk_count + 1}: {chunk[:100]}...")
try:
data = json.loads(chunk)
print(f"🔍 解析后的TTS块 {chunk_count + 1}: {data}")
if data.get("code", 0) == 0 and "data" in data and data["data"]:
chunk_audio = base64.b64decode(data["data"])
@ -642,24 +866,42 @@ class ControlSystem:
total_audio_size += audio_size
chunk_count += 1
# 检查队列状态
current_queue_size = self.output_audio_queue.qsize()
print(f"📦 发送音频块 {chunk_count}: {audio_size} 字节, 队列大小: {current_queue_size}")
# 发送到输出进程
self.output_audio_queue.put(chunk_audio)
# 检查是否发送成功
new_queue_size = self.output_audio_queue.qsize()
if new_queue_size == current_queue_size + 1:
print(f"✅ 音频块 {chunk_count} 发送成功")
else:
print(f"⚠️ 音频块 {chunk_count} 发送后队列大小异常: {current_queue_size} -> {new_queue_size}")
# 显示进度
if chunk_count % 10 == 0:
if chunk_count % 5 == 0: # 更频繁显示进度
progress = f"📥 TTS生成: {chunk_count} 块 | {total_audio_size / 1024:.1f} KB"
print(f"\r{progress}", end='', flush=True)
if data.get("code", 0) == 20000000:
elif data.get("code", 0) == 20000000:
print(f"🏁 收到TTS结束信号")
break
elif data.get("code", 0) > 0:
print(f"❌ TTS错误响应: {data}")
except json.JSONDecodeError:
except json.JSONDecodeError as e:
print(f"❌ JSON解析错误: {e}")
print(f"原始数据: {chunk}")
continue
print(f"\n✅ TTS音频生成完成: {chunk_count} 块, {total_audio_size / 1024:.1f} KB")
print(f"📊 队列大小变化: {queue_size_before} -> {self.output_audio_queue.qsize()}")
# 发送结束信号
self.output_audio_queue.put(None)
# 不再在这里发送结束信号,让输出进程自然播放完所有音频
print(f"📦 TTS音频数据已全部发送等待输出进程播放完成")
print(f"📊 音频队列当前大小: {self.output_audio_queue.qsize()}")
return chunk_count > 0
@ -669,6 +911,8 @@ class ControlSystem:
except Exception as e:
print(f"❌ 文本转语音失败: {e}")
import traceback
print(f"❌ 详细错误: {traceback.format_exc()}")
return False
def _display_status(self):

377
enhanced_voice_detector.py Normal file
View File

@ -0,0 +1,377 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
高级语音检测器
结合能量+ZCR双重检测的自适应语音检测算法
针对16000Hz采样率优化
"""
import numpy as np
import time
from collections import deque
from typing import Dict, Any, Optional
import pyaudio
class EnhancedVoiceDetector:
    """Adaptive voice-activity detector using RMS energy + zero-crossing rate.

    Collects ambient statistics during a calibration phase, then derives
    adaptive thresholds (median/std and IQR based) for frame-level detection.
    A small state machine requires several consecutive voice frames before
    reporting speech and several consecutive silent frames before resetting.
    Tuned for 16 kHz, 16-bit mono PCM input.
    """

    def __init__(self, sample_rate=16000, chunk_size=1024):
        """Initialize detector state.

        Args:
            sample_rate: input sample rate in Hz (used by the ZCR estimate).
            chunk_size: expected frame size in samples (informational only).
        """
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size

        # Sliding windows of recent per-frame features.
        self.energy_window = deque(maxlen=100)
        self.zcr_window = deque(maxlen=100)

        # Rolling statistics computed over the windows above.
        self.energy_stats = {
            'mean': 0, 'std': 0, 'min': float('inf'), 'max': 0,
            'median': 0, 'q75': 0, 'q25': 0
        }
        self.zcr_stats = {
            'mean': 0, 'std': 0, 'min': float('inf'), 'max': 0,
            'median': 0, 'q75': 0, 'q25': 0
        }

        # Calibration: gather this many frames before real detection starts.
        self.calibration_mode = True
        self.calibration_samples = 0
        self.required_calibration = 100  # frames needed to calibrate

        # Adaptive-threshold tuning, biased towards sensitivity.
        self.energy_multiplier = 1.0           # energy threshold multiplier (lowered)
        self.zcr_std_multiplier = 1.0          # ZCR std-dev multiplier (lowered)
        self.min_energy_threshold = 80         # floor for the energy threshold (lowered)
        self.consecutive_voice_threshold = 2   # voice frames needed to confirm (lowered)
        self.consecutive_silence_threshold = 15  # silent frames needed to reset (raised)

        # Cross-frame state.
        self.consecutive_voice_count = 0
        self.consecutive_silence_count = 0
        self.last_voice_time = 0

        # Debug / bookkeeping.
        self.debug_mode = True
        self.voice_count = 0         # number of distinct voice onsets observed
        self.total_samples = 0       # total frames processed
        self._last_voice_state = False  # detection result of the previous frame

    def calculate_energy(self, audio_data: bytes) -> float:
        """Return the RMS energy of one frame of 16-bit PCM audio."""
        if len(audio_data) == 0:
            return 0
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        # Cast to float before squaring to avoid int16 overflow.
        rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
        return rms

    def calculate_zcr(self, audio_data: bytes) -> float:
        """Return the zero-crossing rate of one frame, scaled to crossings/second."""
        if len(audio_data) == 0:
            return 0
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        zero_crossings = np.sum(np.diff(np.sign(audio_array)) != 0)
        zcr = zero_crossings / len(audio_array) * self.sample_rate
        return zcr

    def update_statistics(self, energy: float, zcr: float):
        """Push one frame's features and refresh the rolling statistics."""
        self.energy_window.append(energy)
        self.zcr_window.append(zcr)

        # Wait for a minimally meaningful sample before computing statistics.
        if len(self.energy_window) >= 20:
            energy_array = np.array(self.energy_window)
            zcr_array = np.array(self.zcr_window)

            self.energy_stats['mean'] = np.mean(energy_array)
            self.energy_stats['std'] = np.std(energy_array)
            self.energy_stats['min'] = np.min(energy_array)
            self.energy_stats['max'] = np.max(energy_array)
            self.energy_stats['median'] = np.median(energy_array)
            self.energy_stats['q25'] = np.percentile(energy_array, 25)
            self.energy_stats['q75'] = np.percentile(energy_array, 75)

            self.zcr_stats['mean'] = np.mean(zcr_array)
            self.zcr_stats['std'] = np.std(zcr_array)
            self.zcr_stats['min'] = np.min(zcr_array)
            self.zcr_stats['max'] = np.max(zcr_array)
            self.zcr_stats['median'] = np.median(zcr_array)
            self.zcr_stats['q25'] = np.percentile(zcr_array, 25)
            self.zcr_stats['q75'] = np.percentile(zcr_array, 75)

    def get_adaptive_thresholds(self) -> Dict[str, float]:
        """Return the current energy/ZCR thresholds.

        Uses sensitive fixed defaults until enough history has accumulated,
        then blends a median+std threshold with an IQR-based robust threshold.
        """
        if len(self.energy_window) < 30:
            # Not enough history yet: fall back to sensitive fixed values.
            return {
                'energy_threshold': 120,
                'zcr_min': 2000,
                'zcr_max': 13000
            }

        # Median + std based threshold.
        base_energy_threshold = (self.energy_stats['median'] +
                                 self.energy_multiplier * self.energy_stats['std'])

        # IQR-based threshold is robust against outliers in the window.
        q75 = self.energy_stats['q75']
        q25 = self.energy_stats['q25']
        iqr = q75 - q25  # interquartile range
        iqr_threshold = q75 + 0.5 * iqr

        # Take the lower (more sensitive) of the two, but never below the floor.
        energy_threshold = max(self.min_energy_threshold,
                               min(base_energy_threshold * 0.7, iqr_threshold))

        # ZCR band centered on the median, clamped to a plausible speech range.
        zcr_center = self.zcr_stats['median']
        zcr_spread = self.zcr_std_multiplier * self.zcr_stats['std']
        zcr_min = max(1500, min(3000, zcr_center - zcr_spread))
        zcr_max = min(14000, max(6000, zcr_center + zcr_spread * 2.0))

        # Guarantee a minimum band width.
        if zcr_max - zcr_min < 2000:
            zcr_max = zcr_min + 2000

        return {
            'energy_threshold': energy_threshold,
            'zcr_min': zcr_min,
            'zcr_max': zcr_max
        }

    def is_voice_basic(self, energy: float, zcr: float) -> bool:
        """Single-frame test: energy above threshold AND ZCR inside the band."""
        thresholds = self.get_adaptive_thresholds()
        energy_ok = energy > thresholds['energy_threshold']
        zcr_ok = thresholds['zcr_min'] < zcr < thresholds['zcr_max']
        return energy_ok and zcr_ok

    def is_voice_advanced(self, audio_data: bytes) -> Dict[str, Any]:
        """Full detection for one frame, with calibration and state tracking.

        Args:
            audio_data: one frame of 16-bit PCM bytes.

        Returns:
            dict with at least 'is_voice', 'energy', 'zcr', 'confidence' and
            'calibrating'; during calibration also 'calibration_progress',
            afterwards the active thresholds and consecutive-frame counters.
        """
        energy = self.calculate_energy(audio_data)
        zcr = self.calculate_zcr(audio_data)
        self.update_statistics(energy, zcr)
        self.total_samples += 1

        # Calibration phase: only collect statistics, never report voice.
        if self.calibration_mode:
            self.calibration_samples += 1
            if self.calibration_samples >= self.required_calibration:
                self.calibration_mode = False
                if self.debug_mode:
                    print(f"\n🎯 校准完成!")
                    print(f" 能量统计: {self.energy_stats['median']:.0f}±{self.energy_stats['std']:.0f}")
                    print(f" ZCR统计: {self.zcr_stats['median']:.0f}±{self.zcr_stats['std']:.0f}")
            return {
                'is_voice': False,
                'energy': energy,
                'zcr': zcr,
                'calibrating': True,
                'calibration_progress': self.calibration_samples / self.required_calibration,
                'confidence': 0.0
            }

        is_voice_frame = self.is_voice_basic(energy, zcr)

        # State machine: require consecutive voice frames to confirm speech,
        # and consecutive silent frames before clearing the voice run.
        if is_voice_frame:
            self.consecutive_voice_count += 1
            self.consecutive_silence_count = 0
            self.last_voice_time = time.time()
        else:
            self.consecutive_silence_count += 1
            if self.consecutive_silence_count >= self.consecutive_silence_threshold:
                self.consecutive_voice_count = 0

        final_voice_detected = self.consecutive_voice_count >= self.consecutive_voice_threshold

        # Count a new voice segment only on the rising edge (silence -> voice).
        # Bug fix: the previous condition
        #   `final and not hasattr(self, '_last_voice_state') or not self._last_voice_state`
        # parsed as `(final and ...) or (not last_state)` and therefore bumped
        # the counter on every frame that followed a non-voice frame.
        if final_voice_detected and not self._last_voice_state:
            self.voice_count += 1
        self._last_voice_state = final_voice_detected

        # Confidence: mean of a capped energy ratio and a binary ZCR-in-band score.
        thresholds = self.get_adaptive_thresholds()
        energy_confidence = min(1.0, energy / thresholds['energy_threshold'])
        zcr_confidence = 1.0 if thresholds['zcr_min'] < zcr < thresholds['zcr_max'] else 0.0
        confidence = (energy_confidence + zcr_confidence) / 2

        return {
            'is_voice': final_voice_detected,
            'energy': energy,
            'zcr': zcr,
            'confidence': confidence,
            'energy_threshold': thresholds['energy_threshold'],
            'zcr_min': thresholds['zcr_min'],
            'zcr_max': thresholds['zcr_max'],
            'consecutive_voice_count': self.consecutive_voice_count,
            'consecutive_silence_count': self.consecutive_silence_count,
            'calibrating': False,
            'voice_detection_rate': self.voice_count / self.total_samples if self.total_samples > 0 else 0
        }

    def get_debug_info(self) -> str:
        """Return a one-line human-readable summary of the detector state."""
        if self.calibration_mode:
            return f"校准中: {self.calibration_samples}/{self.required_calibration}"
        thresholds = self.get_adaptive_thresholds()
        # Guard against division by zero when called before any frame arrived.
        rate = (self.voice_count / self.total_samples * 100) if self.total_samples > 0 else 0.0
        return (f"能量阈值: {thresholds['energy_threshold']:.0f} | "
                f"ZCR范围: {thresholds['zcr_min']:.0f}-{thresholds['zcr_max']:.0f} | "
                f"检测率: {self.voice_count}/{self.total_samples} ({rate:.1f}%)")

    def reset(self):
        """Clear all history and return to calibration mode."""
        self.energy_window.clear()
        self.zcr_window.clear()
        self.calibration_mode = True
        self.calibration_samples = 0
        self.consecutive_voice_count = 0
        self.consecutive_silence_count = 0
        self.voice_count = 0
        self.total_samples = 0
        self._last_voice_state = False  # keep edge counting consistent after reset
class VoiceDetectorTester:
    """Interactive microphone test harness for EnhancedVoiceDetector."""

    def __init__(self):
        # Fresh detector with default parameters (16 kHz, 1024-sample frames).
        self.detector = EnhancedVoiceDetector()

    def run_test(self, duration=30):
        """Capture live microphone audio and display detection results.

        Reads 1024-sample frames from the default input device for
        ``duration`` seconds, feeds each frame to the detector, tracks
        contiguous voice segments, prints a live status line and a final
        summary. NOTE(review): the banner always says "30秒" regardless of
        the ``duration`` argument.

        Args:
            duration: test length in seconds.
        """
        print("🎙️ 增强版语音检测器测试")
        print("=" * 50)
        print("📊 检测算法: 能量+ZCR双重检测")
        print("📈 采样率: 16000Hz")
        print("🔄 自适应阈值: 启用")
        print("⏱️ 测试时长: 30秒")
        print("💡 请说话测试检测效果...")
        print("🛑 按 Ctrl+C 提前结束")
        print("=" * 50)
        try:
            # Open the default input device: 16 kHz, 16-bit, mono.
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=1024
            )
            start_time = time.time()
            voice_segments = []      # completed segments: start/end times + duration
            current_segment = None   # segment currently open, if any
            while time.time() - start_time < duration:
                # Read one frame; don't raise on buffer overflow.
                data = stream.read(1024, exception_on_overflow=False)
                result = self.detector.is_voice_advanced(data)
                # Open a segment on the first voice frame; close it on the
                # first non-voice frame after a segment was open.
                if result['is_voice']:
                    if current_segment is None:
                        current_segment = {
                            'start_time': time.time(),
                            'start_sample': self.detector.total_samples
                        }
                else:
                    if current_segment is not None:
                        current_segment['end_time'] = time.time()
                        current_segment['end_sample'] = self.detector.total_samples
                        current_segment['duration'] = current_segment['end_time'] - current_segment['start_time']
                        voice_segments.append(current_segment)
                        current_segment = None
                # Build the single-line status display (overwritten with \r).
                if result['calibrating']:
                    progress = result['calibration_progress'] * 100
                    status = f"\r🔧 校准中: {progress:.0f}% | 能量: {result['energy']:.0f} | ZCR: {result['zcr']:.0f}"
                else:
                    status_icon = "🎤" if result['is_voice'] else "🔇"
                    # ANSI colors: green while voice is detected, grey otherwise.
                    status_color = "\033[92m" if result['is_voice'] else "\033[90m"
                    reset_color = "\033[0m"
                    status = (f"{status_color}{status_icon} "
                              f"能量: {result['energy']:.0f}/{result['energy_threshold']:.0f} | "
                              f"ZCR: {result['zcr']:.0f} ({result['zcr_min']:.0f}-{result['zcr_max']:.0f}) | "
                              f"置信度: {result['confidence']:.2f} | "
                              f"连续: {result['consecutive_voice_count']}/{result['consecutive_silence_count']}{reset_color}")
                print(f"\r{status}", end='', flush=True)
                time.sleep(0.01)
            # Close any segment still open when the loop ends.
            if current_segment is not None:
                current_segment['end_time'] = time.time()
                current_segment['duration'] = current_segment['end_time'] - current_segment['start_time']
                voice_segments.append(current_segment)
            # Final summary.
            print(f"\n\n📊 测试结果统计:")
            print(f" 总检测时长: {duration}秒")
            print(f" 检测到语音段: {len(voice_segments)}个")
            print(f" 总语音时长: {sum(s['duration'] for s in voice_segments):.1f}秒")
            print(f" 语音占比: {sum(s['duration'] for s in voice_segments)/duration*100:.1f}%")
            # NOTE(review): this "average confidence" feeds the 4-byte literal
            # b'test' to the detector ten times — it is a synthetic value, not
            # the session average. Looks like placeholder code; verify intent.
            print(f" 平均置信度: {np.mean([r['confidence'] for r in [self.detector.is_voice_advanced(b'test') for _ in range(10)]]):.2f}")
            if voice_segments:
                print(f" 平均语音段时长: {np.mean([s['duration'] for s in voice_segments]):.1f}秒")
                print(f" 最长语音段: {max(s['duration'] for s in voice_segments):.1f}秒")
            print(f"\n🎯 检测器状态:")
            print(f" {self.detector.get_debug_info()}")
        except KeyboardInterrupt:
            print(f"\n\n🛑 测试被用户中断")
        except Exception as e:
            print(f"\n\n❌ 测试出错: {e}")
        finally:
            # Best-effort cleanup; stream/audio may not exist if open() failed.
            try:
                if 'stream' in locals():
                    stream.stop_stream()
                    stream.close()
                if 'audio' in locals():
                    audio.terminate()
            except:
                pass
def main():
    """Entry point: build a detector tester and run the interactive session."""
    VoiceDetectorTester().run_test()


if __name__ == "__main__":
    main()

98
process_logger.py Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
日志配置模块
为多进程录音系统提供日志记录功能
"""
import logging
import os
import sys
from datetime import datetime
from typing import Optional
def setup_process_logger(process_name: str, log_dir: str = "logs") -> logging.Logger:
    """Create and configure a per-process logger.

    A timestamped log file is created under *log_dir*; DEBUG and above are
    written to the file, while INFO and above are echoed to stdout.

    Args:
        process_name: Process name, used in the logger name and log file name.
        log_dir: Directory where log files are written (created if missing).

    Returns:
        The fully configured ``logging.Logger`` instance.
    """
    # Make sure the log directory exists before opening the file handler.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Timestamped file name keeps one file per process start.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = os.path.join(log_dir, f"{process_name}_{stamp}.log")

    logger = logging.getLogger(f"{process_name}_logger")
    logger.setLevel(logging.DEBUG)
    # Drop any handlers left over from a previous call with the same name.
    logger.handlers.clear()

    # File handler records everything.
    to_file = logging.FileHandler(log_path, encoding='utf-8')
    to_file.setLevel(logging.DEBUG)
    to_file.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    ))

    # Console handler only shows INFO and above.
    to_console = logging.StreamHandler(sys.stdout)
    to_console.setLevel(logging.INFO)
    to_console.setFormatter(logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    ))

    for handler in (to_file, to_console):
        logger.addHandler(handler)

    logger.info(f"日志系统初始化完成 - 进程: {process_name}")
    logger.info(f"日志文件: {log_path}")
    return logger
class ProcessLogger:
    """Thin logger wrapper that prefixes every message with the process name."""

    def __init__(self, process_name: str, log_dir: str = "logs"):
        self.process_name = process_name
        self.logger = setup_process_logger(process_name, log_dir)

    def _emit(self, sink, message: str):
        # Every level shares the same "[process_name] message" formatting.
        sink(f"[{self.process_name}] {message}")

    def debug(self, message: str):
        """Log *message* at DEBUG level."""
        self._emit(self.logger.debug, message)

    def info(self, message: str):
        """Log *message* at INFO level."""
        self._emit(self.logger.info, message)

    def warning(self, message: str):
        """Log *message* at WARNING level."""
        self._emit(self.logger.warning, message)

    def error(self, message: str):
        """Log *message* at ERROR level."""
        self._emit(self.logger.error, message)

    def critical(self, message: str):
        """Log *message* at CRITICAL level."""
        self._emit(self.logger.critical, message)

View File

@ -1,123 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
快速测试脚本
用于验证多进程录音系统的基础功能
"""
import time
import multiprocessing as mp
from audio_processes import InputProcess, OutputProcess
def test_audio_processes():
    """Instantiate the audio process classes and print the test configuration."""
    print("🧪 测试音频进程类...")
    # Queues wiring the input/output processes together.
    command_queue, event_queue, audio_queue = mp.Queue(), mp.Queue(), mp.Queue()
    # Process configuration tuned for a quick smoke test.
    config = {
        'zcr_min': 3000,
        'zcr_max': 10000,
        'min_recording_time': 3.0,
        'max_recording_time': 10.0,  # shortened to keep the test fast
        'silence_threshold': 3.0,
        'pre_record_duration': 2.0,
        'voice_activation_threshold': 5,  # lowered so triggering is easy
        'calibration_samples': 50,  # fewer samples -> shorter calibration
        'adaptive_threshold': True
    }
    # Construction alone is the test: both classes must accept this wiring.
    producer = InputProcess(command_queue, event_queue, config)
    consumer = OutputProcess(audio_queue)
    print("✅ 音频进程类创建成功")
    print("📋 测试配置:")
    print(f" ZCR范围: {config['zcr_min']} - {config['zcr_max']}")
    print(f" 校准样本数: {config['calibration_samples']}")
    print(f" 语音激活阈值: {config['voice_activation_threshold']}")
    return True
def test_dependencies():
    """Check that all required third-party libraries are importable.

    Prints one status line per dependency and a summary line.

    Returns:
        True when every dependency imports cleanly, False otherwise.
    """
    # Local import keeps the module's top-level imports untouched.
    import importlib

    print("🔍 检查依赖库...")
    # Single data-driven loop replaces four copy-pasted try/except blocks;
    # order matches the original output order.
    required = ('numpy', 'pyaudio', 'requests', 'websockets')
    missing = []
    for name in required:
        try:
            importlib.import_module(name)
            print(f"✅ {name}")
        except ImportError:
            print(f"❌ {name}")
            missing.append(name)
    if missing:
        print(f"❌ 缺少依赖: {', '.join(missing)}")
        return False
    print("✅ 所有依赖都已安装")
    return True
def main():
    """主测试函数"""
    print("🚀 多进程录音系统快速测试")
    print("=" * 50)
    # Run each check in order; stop at the first failure.
    checks = (
        (test_dependencies, "❌ 依赖检查失败"),
        (test_audio_processes, "❌ 音频进程测试失败"),
    )
    for check, failure_message in checks:
        if not check():
            print(failure_message)
            return False
        print()
    print("✅ 所有测试通过!")
    print("💡 现在可以运行主程序:")
    print(" python multiprocess_recorder.py")
    return True


if __name__ == "__main__":
    main()

113
start_with_logging.py Normal file
View File

@ -0,0 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
启动脚本示例
演示如何使用带日志记录的多进程录音系统
"""
import os
import sys
import argparse
from datetime import datetime
def ensure_logs_directory(log_dir: str = "logs") -> str:
    """Make sure the log directory exists and return its path.

    Args:
        log_dir: Directory to create if missing (default: "logs", matching
            the previous hard-coded behavior).

    Returns:
        The directory path, so callers can use it directly.
    """
    if not os.path.exists(log_dir):
        # exist_ok guards against the race where another process creates the
        # directory between the exists() check and this call.
        os.makedirs(log_dir, exist_ok=True)
        print(f"✅ 创建日志目录: {log_dir}")
    return log_dir
def cleanup_old_logs(log_dir="logs", max_files=10):
    """Delete the oldest ``.log`` files so at most *max_files* remain."""
    if not os.path.exists(log_dir):
        return
    # Collect (path, mtime) pairs for every log file in the directory.
    entries = [
        (os.path.join(log_dir, name), os.path.getmtime(os.path.join(log_dir, name)))
        for name in os.listdir(log_dir)
        if name.endswith('.log')
    ]
    # Oldest first; only the surplus beyond max_files is removed.
    entries.sort(key=lambda pair: pair[1])
    surplus = max(0, len(entries) - max_files)
    for path, _mtime in entries[:surplus]:
        try:
            os.remove(path)
            print(f"🗑️ 删除旧日志文件: {path}")
        except Exception as e:
            # Stop on the first failure, just like the original loop.
            print(f"⚠️ 删除日志文件失败 {path}: {e}")
            break
def main():
    """Entry point: parse CLI options, prepare logging, launch the recorder."""
    parser = argparse.ArgumentParser(
        description='带日志记录的多进程录音系统启动器',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  python start_with_logging.py                  # 使用默认设置
  python start_with_logging.py --clean-logs     # 清理旧日志
  python start_with_logging.py --log-dir my_logs  # 指定日志目录
        """
    )
    parser.add_argument('--character', '-c', type=str, default='libai',
                        help='选择角色 (默认: libai)')
    parser.add_argument('--log-dir', type=str, default='logs',
                        help='日志目录路径 (默认: logs)')
    parser.add_argument('--clean-logs', action='store_true',
                        help='清理旧日志文件')
    parser.add_argument('--max-log-files', type=int, default=10,
                        help='保留的最大日志文件数量 (默认: 10)')
    parser.add_argument('--config', type=str,
                        help='配置文件路径')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='详细输出')
    opts = parser.parse_args()

    print("🚀 带日志记录的多进程录音系统")
    print("=" * 60)

    # Prepare the log directory and optionally trim old log files.
    log_dir = ensure_logs_directory()
    if opts.clean_logs:
        cleanup_old_logs(log_dir, opts.max_log_files)

    print(f"📁 日志目录: {log_dir}")
    print(f"🎭 角色: {opts.character}")
    print("=" * 60)

    try:
        # Rebuild sys.argv so the recorder's own argument parser sees our
        # forwarded options.
        forwarded = ['multiprocess_recorder.py']
        if opts.character:
            forwarded += ['-c', opts.character]
        if opts.config:
            forwarded += ['--config', opts.config]
        if opts.verbose:
            forwarded.append('--verbose')
        sys.argv = forwarded
        import multiprocess_recorder
        multiprocess_recorder.main()
    except KeyboardInterrupt:
        print("\n👋 用户中断")
    except Exception as e:
        print(f"❌ 启动失败: {e}")
        if opts.verbose:
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    main()

View File

@ -1,194 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音检测测试脚本
用于测试和调试ZCR语音检测功能
"""
import numpy as np
import time
import pyaudio
from audio_processes import InputProcess
import multiprocessing as mp
import queue
class VoiceDetectionTester:
    """Interactive tester for ZCR (zero-crossing-rate) based voice detection.

    Runs two phases on the default microphone: a 10-second ambient-noise
    calibration that derives suggested ZCR thresholds, followed by a
    20-second live detection test using those thresholds.
    """
    def __init__(self):
        # Capture settings: 16-bit mono PCM at 16 kHz, 1024 frames per read.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        self.CHUNK_SIZE = 1024
        # Test parameters
        self.test_duration = 30  # planned total length in seconds (10s calibration + 20s detection)
        self.zcr_history = []  # reserved for collected ZCR values
        self.voice_count = 0  # number of voice segments detected so far
        # Audio device handles, created lazily in setup_audio().
        self.audio = None
        self.stream = None
    def setup_audio(self) -> bool:
        """Open the default input device; return True on success."""
        try:
            self.audio = pyaudio.PyAudio()
            self.stream = self.audio.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                frames_per_buffer=self.CHUNK_SIZE
            )
            print("✅ 音频设备初始化成功")
            return True
        except Exception as e:
            print(f"❌ 音频设备初始化失败: {e}")
            return False
    def calculate_zcr(self, audio_data: bytes) -> float:
        """Return the zero-crossing rate of one PCM chunk, scaled to crossings/second.

        Args:
            audio_data: Raw little-endian 16-bit PCM bytes.
        """
        if len(audio_data) == 0:
            return 0
        audio_array = np.frombuffer(audio_data, dtype=np.int16)
        # Count sign changes between adjacent samples, then scale by the
        # sample rate to express the result per second.
        zero_crossings = np.sum(np.diff(np.sign(audio_array)) != 0)
        zcr = zero_crossings / len(audio_array) * self.RATE
        return zcr
    def test_detection(self):
        """Run calibration then the live voice-detection test, printing progress."""
        print("🎙️ 开始语音检测测试")
        print("=" * 50)
        # Phase 1: sample ambient noise for 10 seconds to characterize silence.
        print("🔍 第一阶段:环境噪音校准 (10秒)")
        print("请保持安静,不要说话...")
        calibration_samples = []
        start_time = time.time()
        try:
            while time.time() - start_time < 10:
                data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
                if len(data) > 0:
                    zcr = self.calculate_zcr(data)
                    calibration_samples.append(zcr)
                # Progress indicator (carriage return keeps it on one line).
                progress = (time.time() - start_time) / 10 * 100
                print(f"\r校准进度: {progress:.1f}%", end='', flush=True)
                time.sleep(0.01)
            print("\n✅ 环境校准完成")
            # Phase 2 only runs when calibration produced samples, because the
            # suggested thresholds are derived from their statistics.
            if calibration_samples:
                avg_zcr = np.mean(calibration_samples)
                std_zcr = np.std(calibration_samples)
                min_zcr = min(calibration_samples)
                max_zcr = max(calibration_samples)
                print(f"📊 环境噪音统计:")
                print(f" 平均ZCR: {avg_zcr:.0f}")
                print(f" 标准差: {std_zcr:.0f}")
                print(f" 最小值: {min_zcr:.0f}")
                print(f" 最大值: {max_zcr:.0f}")
                # Suggested detection band: mean + 2..6 standard deviations,
                # clamped to the [2400, 12000] range.
                suggested_min = max(2400, avg_zcr + 2 * std_zcr)
                suggested_max = min(12000, avg_zcr + 6 * std_zcr)
                print(f"\n🎯 建议的语音检测阈值:")
                print(f" 最小阈值: {suggested_min:.0f}")
                print(f" 最大阈值: {suggested_max:.0f}")
                # Phase 2: live detection for 20 seconds using the suggested band.
                print(f"\n🎙️ 第二阶段:语音检测测试 (20秒)")
                print("现在请说话,测试语音检测...")
                voice_threshold = suggested_min
                silence_threshold = suggested_max
                consecutive_voice = 0
                voice_detected = False
                test_start = time.time()
                while time.time() - test_start < 20:
                    data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
                    if len(data) > 0:
                        zcr = self.calculate_zcr(data)
                        # A chunk counts as voice when its ZCR falls inside the band.
                        is_voice = voice_threshold < zcr < silence_threshold
                        if is_voice:
                            consecutive_voice += 1
                            # Require 5 consecutive voice chunks before announcing
                            # a new segment, to debounce spurious spikes.
                            if consecutive_voice >= 5 and not voice_detected:
                                voice_detected = True
                                self.voice_count += 1
                                print(f"\n🎤 检测到语音 #{self.voice_count}! ZCR: {zcr:.0f}")
                        else:
                            consecutive_voice = 0
                            if voice_detected:
                                voice_detected = False
                                # last_voice_time was set on the previous voice
                                # chunk, so this is the time since voice stopped.
                                print(f" 语音结束,持续时间: {time.time() - last_voice_time:.1f}秒")
                        if voice_detected:
                            last_voice_time = time.time()
                        # Real-time status line with the current ZCR and counters.
                        status = "🎤" if voice_detected else "🔇"
                        print(f"\r{status} ZCR: {zcr:.0f} | 阈值: {voice_threshold:.0f}-{silence_threshold:.0f} | "
                              f"连续语音: {consecutive_voice}/5", end='', flush=True)
                    time.sleep(0.01)
                print(f"\n\n✅ 测试完成!共检测到 {self.voice_count} 次语音")
        except KeyboardInterrupt:
            print("\n🛑 测试被用户中断")
        except Exception as e:
            print(f"\n❌ 测试过程中出错: {e}")
    def cleanup(self):
        """Release the audio stream and PyAudio instance, ignoring errors."""
        if self.stream:
            try:
                self.stream.stop_stream()
                self.stream.close()
            except:
                pass
        if self.audio:
            try:
                self.audio.terminate()
            except:
                pass
    def run_test(self):
        """Run the full test: device setup, detection phases, then cleanup."""
        print("🚀 语音检测测试工具")
        print("=" * 60)
        if not self.setup_audio():
            print("❌ 无法初始化音频设备,测试终止")
            return
        try:
            self.test_detection()
        finally:
            # Always release the device, even on interrupt or error.
            self.cleanup()
        print("\n👋 测试结束")
def main():
    """Entry point: build a detection tester and run the full test."""
    VoiceDetectionTester().run_test()


if __name__ == "__main__":
    main()