Local-Voice/control_system.py
2025-09-21 03:00:11 +08:00

1018 lines
41 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
多进程音频控制系统
实现主控制进程和状态管理
"""
import multiprocessing as mp
import queue
import time
import threading
import requests
import json
import base64
import gzip
import uuid
import asyncio
import websockets
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, asdict
from enum import Enum
import os
import sys
from audio_processes import (
InputProcess, OutputProcess,
RecordingState, ControlCommand, ProcessEvent
)
class ControlSystem:
"""主控制系统"""
def __init__(self, config: Dict[str, Any] = None):
self.config = config or self._get_default_config()
# 进程间通信
self.input_command_queue = mp.Queue(maxsize=100) # 主进程 → 输入进程
self.input_event_queue = mp.Queue(maxsize=100) # 输入进程 → 主进程
self.output_audio_queue = mp.Queue(maxsize=1000) # 主进程 → 输出进程
self.output_event_queue = mp.Queue(maxsize=100) # 输出进程 → 主进程
# 进程
self.input_process = None
self.output_process = None
# 状态管理
self.state = RecordingState.IDLE
self.processing_complete = False
self.playback_complete = False
# 当前处理的数据
self.current_audio_data = None
self.current_audio_metadata = None
# API配置
self.api_config = self._setup_api_config()
# 统计信息
self.stats = {
'total_conversations': 0,
'total_recording_time': 0,
'successful_processing': 0,
'failed_processing': 0
}
# 运行状态
self.running = True
# 检查依赖
self._check_dependencies()
def _get_default_config(self) -> Dict[str, Any]:
"""获取默认配置"""
return {
'system': {
'max_queue_size': 1000,
'process_timeout': 30,
'heartbeat_interval': 1.0
},
'audio': {
'sample_rate': 16000,
'channels': 1,
'chunk_size': 1024
},
'recording': {
'min_duration': 2.0,
'max_duration': 30.0,
'silence_threshold': 3.0
},
'processing': {
'enable_asr': True,
'enable_llm': True,
'enable_tts': True,
'character': 'libai'
}
}
def _setup_api_config(self) -> Dict[str, Any]:
"""设置API配置"""
config = {
'asr': {
'appid': "8718217928",
'token': "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
'cluster': "volcengine_input_common",
'ws_url': "wss://openspeech.bytedance.com/api/v2/asr"
},
'llm': {
'api_url': "https://ark.cn-beijing.volces.com/api/v3/chat/completions",
'model': "doubao-seed-1-6-flash-250828",
'api_key': os.environ.get("ARK_API_KEY", ""),
'max_tokens': 50
},
'tts': {
'url': "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
'app_id': "8718217928",
'access_key': "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
'resource_id': "volc.service_type.10029",
'app_key': "aGjiRDfUWi",
'speaker': "zh_female_wanqudashu_moon_bigtts"
}
}
# 加载角色配置
character_config = self._load_character_config(self.config['processing']['character'])
if character_config and "voice" in character_config:
config['tts']['speaker'] = character_config["voice"]
return config
def _load_character_config(self, character_name: str) -> Optional[Dict[str, Any]]:
"""加载角色配置"""
characters_dir = os.path.join(os.path.dirname(__file__), "characters")
config_file = os.path.join(characters_dir, f"{character_name}.json")
if not os.path.exists(config_file):
print(f"⚠️ 角色配置文件不存在: {config_file}")
return None
try:
with open(config_file, 'r', encoding='utf-8') as f:
config = json.load(f)
print(f"✅ 加载角色: {config.get('name', character_name)}")
return config
except Exception as e:
print(f"❌ 加载角色配置失败: {e}")
return None
def _check_dependencies(self):
"""检查依赖库"""
missing_deps = []
try:
import pyaudio
except ImportError:
missing_deps.append("pyaudio")
try:
import numpy
except ImportError:
missing_deps.append("numpy")
try:
import requests
except ImportError:
missing_deps.append("requests")
try:
import websockets
except ImportError:
missing_deps.append("websockets")
if missing_deps:
print(f"❌ 缺少依赖库: {', '.join(missing_deps)}")
print("请安装: pip install " + " ".join(missing_deps))
sys.exit(1)
# 检查API密钥
if not self.api_config['llm']['api_key']:
print("⚠️ 未设置 ARK_API_KEY 环境变量,大语言模型功能将被禁用")
self.config['processing']['enable_llm'] = False
def start(self):
"""启动系统"""
print("🚀 启动多进程音频控制系统")
print("=" * 60)
# 创建并启动输入进程
input_config = {
'zcr_min': 2400,
'zcr_max': 12000,
'min_recording_time': self.config['recording']['min_duration'],
'max_recording_time': self.config['recording']['max_duration'],
'silence_threshold': self.config['recording']['silence_threshold'],
'pre_record_duration': 2.0
}
self.input_process = mp.Process(
target=InputProcess(
self.input_command_queue,
self.input_event_queue,
input_config
).run
)
# 创建并启动输出进程
output_config = {
'buffer_size': 1000,
'show_progress': True,
'progress_interval': 100
}
self.output_process = mp.Process(
target=OutputProcess(
self.output_audio_queue,
output_config,
self.output_event_queue # 传递事件队列
).run
)
# 启动进程
self.input_process.start()
self.output_process.start()
print("✅ 所有进程已启动")
print("🎙️ 输入进程:负责录音和语音检测")
print("🔊 输出进程:负责音频播放")
print("🎯 主控制负责协调和AI处理")
print("=" * 60)
# 启动主控制循环
self._control_loop()
def _control_loop(self):
"""主控制循环"""
print("🎯 主控制循环启动")
try:
while self.running:
# 根据状态处理不同逻辑
if self.state == RecordingState.IDLE:
self._handle_idle_state()
elif self.state == RecordingState.RECORDING:
self._handle_recording_state()
elif self.state == RecordingState.PROCESSING:
self._handle_processing_state()
elif self.state == RecordingState.PLAYING:
self._handle_playing_state()
# 检查进程事件
self._check_events()
# 显示状态
self._display_status()
# 控制循环频率
time.sleep(0.1)
except KeyboardInterrupt:
print("\n👋 收到退出信号...")
self.shutdown()
except Exception as e:
print(f"❌ 主控制循环错误: {e}")
self.shutdown()
def _handle_idle_state(self):
"""处理空闲状态"""
if self.state == RecordingState.IDLE:
# 启用输入进程录音功能
self.input_command_queue.put(ControlCommand('enable_recording'))
self.state = RecordingState.RECORDING
print("🎯 状态IDLE → RECORDING")
def _handle_recording_state(self):
"""处理录音状态"""
# 等待输入进程发送录音完成事件
pass
def _handle_processing_state(self):
"""处理状态"""
if not self.processing_complete:
self._process_audio_pipeline()
def _handle_playing_state(self):
"""处理播放状态"""
# 现在主要由输出进程的播放完成事件驱动
pass
def _check_events(self):
"""检查进程事件"""
# 检查输入进程事件
try:
while True:
event = self.input_event_queue.get_nowait()
if event.event_type == 'recording_complete':
print("📡 主控制:收到录音完成事件")
self._handle_recording_complete(event)
except queue.Empty:
pass
# 检查输出进程事件
try:
while True:
event = self.output_event_queue.get_nowait()
if event.event_type == 'playback_complete':
print("📡 主控制:收到播放完成事件")
self._handle_playback_complete(event)
except queue.Empty:
pass
def _handle_recording_complete(self, event: ProcessEvent):
"""处理录音完成事件"""
# 禁用输入进程录音功能
self.input_command_queue.put(ControlCommand('disable_recording'))
# 保存录音数据
self.current_audio_data = event.data
self.current_audio_metadata = event.metadata
# 更新统计
self.stats['total_recording_time'] += event.metadata['duration']
# 切换到处理状态
self.state = RecordingState.PROCESSING
self.processing_complete = False
self.playback_complete = False
print(f"🎯 状态RECORDING → PROCESSING (时长: {event.metadata['duration']:.2f}s)")
def _handle_playback_complete(self, event: ProcessEvent):
"""处理播放完成事件"""
# 标记播放完成
self.playback_complete = True
# 更新统计
self.stats['total_conversations'] += 1
# 切换到空闲状态
self.state = RecordingState.IDLE
print(f"🎯 状态PLAYING → IDLE")
# 重新启用输入进程录音功能
self.input_command_queue.put(ControlCommand('enable_recording'))
def _process_audio_pipeline(self):
"""处理音频流水线STT + LLM + TTS"""
try:
print("🤖 开始处理音频流水线")
# 1. 语音识别 (STT)
if self.config['processing']['enable_asr']:
text = self._speech_to_text(self.current_audio_data)
if not text:
print("❌ 语音识别失败")
self._handle_processing_failure()
return
print(f"📝 识别结果: {text}")
else:
text = "语音识别功能已禁用"
# 2. 大语言模型 (LLM)
if self.config['processing']['enable_llm']:
response = self._call_llm(text)
if not response:
print("❌ 大语言模型调用失败")
self._handle_processing_failure()
return
print(f"💬 AI回复: {response}")
else:
response = "大语言模型功能已禁用"
# 3. 文本转语音 (TTS)
if self.config['processing']['enable_tts']:
success = self._text_to_speech_streaming(response)
if not success:
print("❌ 文本转语音失败")
self._handle_processing_failure()
return
else:
print(" 文本转语音功能已禁用")
# 直接发送结束信号
self.output_audio_queue.put(None)
# 标记处理完成
self.processing_complete = True
self.state = RecordingState.PLAYING
self.stats['successful_processing'] += 1
print("🎯 状态PROCESSING → PLAYING")
except Exception as e:
print(f"❌ 处理流水线错误: {e}")
self._handle_processing_failure()
def _handle_processing_failure(self):
"""处理失败情况"""
self.stats['failed_processing'] += 1
self.state = RecordingState.IDLE
self.processing_complete = True
self.playback_complete = True
print("🎯 状态PROCESSING → IDLE (失败)")
def _speech_to_text(self, audio_data: bytes) -> Optional[str]:
"""语音转文字"""
try:
print(f"🔍 开始语音识别,音频大小: {len(audio_data)} 字节")
result = asyncio.run(self._recognize_audio_async(audio_data))
if result:
print(f"✅ 语音识别成功: {result}")
else:
print(f"❌ 语音识别返回空结果")
return result
except Exception as e:
print(f"❌ 语音识别异常: {e}")
import traceback
print(f"❌ 详细错误信息:\n{traceback.format_exc()}")
return None
async def _recognize_audio_async(self, audio_data: bytes) -> Optional[str]:
"""异步语音识别"""
if not self.config['processing']['enable_asr']:
return "语音识别功能已禁用"
try:
# 验证音频数据
print(f"🎵 音频数据验证:")
print(f" - 大小: {len(audio_data)} 字节")
print(f" - 是否为空: {len(audio_data) == 0}")
if len(audio_data) == 0:
print("❌ 音频数据为空")
return None
# 检查是否有WAV头部
has_wav_header = audio_data.startswith(b'RIFF')
print(f" - 有WAV头部: {has_wav_header}")
if has_wav_header:
# 解析WAV头部
print(f" - WAV格式可能需要提取PCM数据")
riff_size = int.from_bytes(audio_data[4:8], 'little')
wave_fmt = audio_data[8:12]
if wave_fmt == b'WAVE':
print(f" - WAVE格式正确")
# 查找fmt块
fmt_pos = audio_data.find(b'fmt ')
if fmt_pos > 0:
fmt_size = int.from_bytes(audio_data[fmt_pos+4:fmt_pos+8], 'little')
audio_format = int.from_bytes(audio_data[fmt_pos+8:fmt_pos+10], 'little')
channels = int.from_bytes(audio_data[fmt_pos+10:fmt_pos+12], 'little')
sample_rate = int.from_bytes(audio_data[fmt_pos+12:fmt_pos+16], 'little')
print(f" - 音频格式: {audio_format}")
print(f" - 声道数: {channels}")
print(f" - 采样率: {sample_rate}")
else:
print(f" - 纯PCM数据")
# 检查音频数据格式假设是16位PCM
if len(audio_data) % 2 != 0:
print(f"⚠️ 音频数据长度不是2的倍数: {len(audio_data)}")
# 计算音频时长
sample_rate = self.config['audio']['sample_rate']
channels = self.config['audio']['channels']
bytes_per_second = sample_rate * channels * 2 # 16位 = 2字节
duration = len(audio_data) / bytes_per_second
print(f" - 配置采样率: {sample_rate} Hz")
print(f" - 配置声道数: {channels}")
print(f" - 估算时长: {duration:.2f}")
if duration < 0.5:
print(f"⚠️ 音频时长过短: {duration:.2f}")
import websockets
print(f"🔗 连接WebSocket ASR服务: {self.api_config['asr']['ws_url']}")
# 生成ASR头部
def generate_asr_header(message_type=1, message_type_specific_flags=0):
PROTOCOL_VERSION = 0b0001
DEFAULT_HEADER_SIZE = 0b0001
JSON = 0b0001
GZIP = 0b0001
header = bytearray()
header.append((PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE)
header.append((message_type << 4) | message_type_specific_flags)
header.append((JSON << 4) | GZIP)
header.append(0x00) # reserved
return header
# 解析ASR响应 - 基于recorder.py的工作实现
def parse_asr_response(res):
"""解析ASR响应"""
PROTOCOL_VERSION = res[0] >> 4
header_size = res[0] & 0x0f
message_type = res[1] >> 4
message_type_specific_flags = res[1] & 0x0f
serialization_method = res[2] >> 4
message_compression = res[2] & 0x0f
reserved = res[3]
header_extensions = res[4:header_size * 4]
payload = res[header_size * 4:]
result = {}
payload_msg = None
payload_size = 0
print(f"🔍 响应头信息: message_type={message_type}, compression={message_compression}, serialization={serialization_method}")
if message_type == 0b1001: # SERVER_FULL_RESPONSE
payload_size = int.from_bytes(payload[:4], "big", signed=True)
payload_msg = payload[4:]
print(f"📦 Full响应: payload_size={payload_size}")
elif message_type == 0b1011: # SERVER_ACK
seq = int.from_bytes(payload[:4], "big", signed=True)
result['seq'] = seq
if len(payload) >= 8:
payload_size = int.from_bytes(payload[4:8], "big", signed=False)
payload_msg = payload[8:]
print(f"📦 ACK响应: seq={seq}, payload_size={payload_size}")
elif message_type == 0b1111: # SERVER_ERROR_RESPONSE
code = int.from_bytes(payload[:4], "big", signed=False)
result['code'] = code
payload_size = int.from_bytes(payload[4:8], "big", signed=False)
payload_msg = payload[8:]
print(f"❌ 错误响应: code={code}")
if payload_msg is None:
return result
if message_compression == 0b0001: # GZIP
payload_msg = gzip.decompress(payload_msg)
print(f"📦 解压后大小: {len(payload_msg)} 字节")
if serialization_method == 0b0001: # JSON
payload_msg = json.loads(str(payload_msg, "utf-8"))
print(f"📋 解析后的JSON: {json.dumps(payload_msg, indent=2, ensure_ascii=False)}")
result['payload_msg'] = payload_msg
result['payload_size'] = payload_size
return result
# 构建请求参数
reqid = str(uuid.uuid4())
request_params = {
'app': {
'appid': self.api_config['asr']['appid'],
'cluster': self.api_config['asr']['cluster'],
'token': self.api_config['asr']['token'],
},
'user': {
'uid': 'multiprocess_asr'
},
'request': {
'reqid': reqid,
'nbest': 1,
'workflow': 'audio_in,resample,partition,vad,fe,decode,itn,nlu_punctuate',
'show_language': False,
'show_utterances': False,
'result_type': 'full',
"sequence": 1
},
'audio': {
'format': 'pcm',
'rate': self.config['audio']['sample_rate'],
'language': 'zh-CN',
'bits': 16,
'channel': self.config['audio']['channels'],
'codec': 'raw'
}
}
# 构建请求
print(f"📋 ASR请求参数:")
print(f" - audio.format: {request_params['audio']['format']}")
print(f" - audio.rate: {request_params['audio']['rate']}")
print(f" - audio.channel: {request_params['audio']['channel']}")
print(f" - audio.bits: {request_params['audio']['bits']}")
print(f" - audio.codec: {request_params['audio']['codec']}")
print(f" - request.workflow: {request_params['request']['workflow']}")
payload_bytes = str.encode(json.dumps(request_params))
payload_bytes = gzip.compress(payload_bytes)
full_client_request = bytearray(generate_asr_header())
full_client_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
full_client_request.extend(payload_bytes)
# 设置认证头
additional_headers = {'Authorization': 'Bearer; {}'.format(self.api_config['asr']['token'])}
# 连接WebSocket
print(f"📡 尝试连接WebSocket...")
print(f"🔗 WebSocket URL: {self.api_config['asr']['ws_url']}")
print(f"📋 Headers: {additional_headers}")
async with websockets.connect(
self.api_config['asr']['ws_url'],
additional_headers=additional_headers,
max_size=1000000000,
ping_interval=20,
ping_timeout=60
) as ws:
print(f"✅ WebSocket连接成功")
# 发送请求
print(f"📤 发送ASR请求...")
print(f"📦 请求大小: {len(full_client_request)} 字节")
await ws.send(full_client_request)
res = await ws.recv()
print(f"📥 收到ASR响应大小: {len(res)} 字节")
result = parse_asr_response(res)
print(f"🔍 解析ASR响应: {result}")
# 发送音频数据 - 基于recorder.py实现
chunk_size = int(self.config['audio']['channels'] * 2 *
self.config['audio']['sample_rate'] * 15000 / 1000)
print(f"🎵 开始发送音频数据:")
print(f" - 总大小: {len(audio_data)} 字节")
print(f" - 分块大小: {chunk_size} 字节")
print(f" - 预计分块数: {(len(audio_data) + chunk_size - 1) // chunk_size}")
total_chunks = (len(audio_data) + chunk_size - 1) // chunk_size
chunks_sent = 0
for offset in range(0, len(audio_data), chunk_size):
chunks_sent += 1
chunk = audio_data[offset:offset + chunk_size]
last = (offset + chunk_size) >= len(audio_data)
print(f"📦 发送第 {chunks_sent}/{total_chunks} 块:")
print(f" - 当前块大小: {len(chunk)} 字节")
print(f" - 偏移量: {offset}-{offset + len(chunk)}")
print(f" - 是否最后一块: {last}")
try:
payload_bytes = gzip.compress(chunk)
audio_only_request = bytearray(
generate_asr_header(
message_type=0b0010,
message_type_specific_flags=0b0010 if last else 0
)
)
audio_only_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
audio_only_request.extend(payload_bytes)
print(f" - 压缩后大小: {len(payload_bytes)} 字节")
print(f" - 总请求数据大小: {len(audio_only_request)} 字节")
await ws.send(audio_only_request)
print(f" ✅ 第 {chunks_sent} 块发送成功")
# 等待服务器响应
try:
res = await asyncio.wait_for(ws.recv(), timeout=30.0)
print(f" 📥 收到第 {chunks_sent} 块响应,大小: {len(res)} 字节")
result = parse_asr_response(res)
print(f" 🔍 第 {chunks_sent} 块响应解析: {result}")
except asyncio.TimeoutError:
print(f" ⏰ 第 {chunks_sent} 块响应超时")
raise Exception("音频块响应超时")
# 检查每个响应是否有错误
if 'code' in result:
print(f" 🔍 第 {chunks_sent} 块响应码: {result['code']}")
if result['code'] != 1000:
print(f" ❌ 第 {chunks_sent} 块数据发送失败: {result}")
return None
if 'payload_msg' in result and result['payload_msg'].get('code') != 1000:
print(f" ❌ 第 {chunks_sent} 块数据发送失败: {result['payload_msg']}")
return None
except Exception as chunk_error:
print(f" ❌ 第 {chunks_sent} 块发送异常: {chunk_error}")
raise chunk_error
if last:
print(f"📨 发送最后一块音频数据完成")
print(f"🎯 所有音频数据发送完成,共发送 {chunks_sent}")
# 检查最后一个响应中是否包含识别结果
print(f"🎯 检查最终识别结果...")
print(f"📋 最后一个响应: {result}")
if 'payload_msg' in result:
payload_msg = result['payload_msg']
print(f"📋 最终Payload结构: {list(payload_msg.keys()) if isinstance(payload_msg, dict) else type(payload_msg)}")
print(f"📋 最终Payload内容: {payload_msg}")
if isinstance(payload_msg, dict):
# 检查响应码
if 'code' in payload_msg:
code = payload_msg['code']
print(f"🔢 最终响应码: {code}")
if code == 1000:
print(f"✅ ASR识别成功")
else:
print(f"❌ ASR服务返回错误: {payload_msg.get('message', '未知错误')}")
return None
# 查找结果 - 与recorder.py保持一致
if 'result' in payload_msg:
results = payload_msg['result']
print(f"📝 找到结果字段 'result': {results}")
if isinstance(results, list) and results:
text = results[0].get('text', '识别失败')
print(f"✅ 提取识别文本: {text}")
return text
elif isinstance(results, str):
print(f"✅ 提取识别文本: {results}")
return results
else:
print(f"❌ 未找到result字段可用字段: {list(payload_msg.keys())}")
print(f"完整payload: {json.dumps(payload_msg, indent=2, ensure_ascii=False)}")
else:
print(f"❌ Payload不是字典类型: {type(payload_msg)}")
else:
print(f"❌ 响应中没有payload_msg字段")
print(f"可用字段: {list(result.keys())}")
if 'code' in result:
print(f"错误码: {result['code']}")
return None
except Exception as e:
print(f"❌ 语音识别失败: {e}")
return None
def _call_llm(self, text: str) -> Optional[str]:
"""调用大语言模型"""
if not self.config['processing']['enable_llm']:
return "大语言模型功能已禁用"
try:
# 获取角色配置
character_config = self._load_character_config(self.config['processing']['character'])
if character_config and "system_prompt" in character_config:
system_prompt = character_config["system_prompt"]
else:
system_prompt = "你是一个智能助手,请根据用户的语音输入提供有帮助的回答。保持回答简洁明了。"
# 构建请求
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_config['llm']['api_key']}"
}
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text}
]
data = {
"model": self.api_config['llm']['model'],
"messages": messages,
"max_tokens": self.api_config['llm']['max_tokens'],
"stream": False # 非流式,简化实现
}
response = requests.post(
self.api_config['llm']['api_url'],
headers=headers,
json=data,
timeout=30
)
if response.status_code == 200:
result = response.json()
if 'choices' in result and len(result['choices']) > 0:
content = result['choices'][0]['message']['content']
return content.strip()
print(f"❌ LLM API调用失败: {response.status_code}")
return None
except Exception as e:
print(f"❌ 大语言模型调用失败: {e}")
return None
def _text_to_speech_streaming(self, text: str) -> bool:
"""文本转语音(流式)"""
if not self.config['processing']['enable_tts']:
return False
try:
print("🎵 开始文本转语音")
print(f"📝 待转换文本: {text}")
# 发送元数据
metadata_msg = f"METADATA:{text[:30]}..."
print(f"📦 发送元数据: {metadata_msg}")
self.output_audio_queue.put(metadata_msg)
# 构建请求头
headers = {
"X-Api-App-Id": self.api_config['tts']['app_id'],
"X-Api-Access-Key": self.api_config['tts']['access_key'],
"X-Api-Resource-Id": self.api_config['tts']['resource_id'],
"X-Api-App-Key": self.api_config['tts']['app_key'],
"Content-Type": "application/json",
"Connection": "keep-alive"
}
# 构建请求参数
payload = {
"user": {
"uid": "multiprocess_tts"
},
"req_params": {
"text": text,
"speaker": self.api_config['tts']['speaker'],
"audio_params": {
"format": "pcm",
"sample_rate": self.config['audio']['sample_rate'],
"enable_timestamp": True
},
"additions": "{\"explicit_language\":\"zh\",\"disable_markdown_filter\":true, \"enable_timestamp\":true}\"}"
}
}
# 发送请求
session = requests.Session()
try:
print(f"🌐 发送TTS请求到: {self.api_config['tts']['url']}")
response = session.post(
self.api_config['tts']['url'],
headers=headers,
json=payload,
stream=True
)
if response.status_code != 200:
print(f"❌ TTS请求失败: {response.status_code}")
return False
print(f"✅ TTS请求成功开始接收音频流")
# 处理流式响应
total_audio_size = 0
chunk_count = 0
queue_size_before = self.output_audio_queue.qsize()
for chunk in response.iter_lines(decode_unicode=True):
if not chunk:
continue
print(f"🔍 原始TTS响应块 {chunk_count + 1}: {chunk[:100]}...")
try:
data = json.loads(chunk)
print(f"🔍 解析后的TTS块 {chunk_count + 1}: {data}")
if data.get("code", 0) == 0 and "data" in data and data["data"]:
chunk_audio = base64.b64decode(data["data"])
audio_size = len(chunk_audio)
total_audio_size += audio_size
chunk_count += 1
# 检查队列状态
current_queue_size = self.output_audio_queue.qsize()
print(f"📦 发送音频块 {chunk_count}: {audio_size} 字节, 队列大小: {current_queue_size}")
# 发送到输出进程
self.output_audio_queue.put(chunk_audio)
# 检查是否发送成功
new_queue_size = self.output_audio_queue.qsize()
if new_queue_size == current_queue_size + 1:
print(f"✅ 音频块 {chunk_count} 发送成功")
else:
print(f"⚠️ 音频块 {chunk_count} 发送后队列大小异常: {current_queue_size} -> {new_queue_size}")
# 显示进度
if chunk_count % 5 == 0: # 更频繁显示进度
progress = f"📥 TTS生成: {chunk_count} 块 | {total_audio_size / 1024:.1f} KB"
print(f"\r{progress}", end='', flush=True)
elif data.get("code", 0) == 20000000:
print(f"🏁 收到TTS结束信号")
break
elif data.get("code", 0) > 0:
print(f"❌ TTS错误响应: {data}")
except json.JSONDecodeError as e:
print(f"❌ JSON解析错误: {e}")
print(f"原始数据: {chunk}")
continue
print(f"\n✅ TTS音频生成完成: {chunk_count} 块, {total_audio_size / 1024:.1f} KB")
print(f"📊 队列大小变化: {queue_size_before} -> {self.output_audio_queue.qsize()}")
# 不再在这里发送结束信号,让输出进程自然播放完所有音频
print(f"📦 TTS音频数据已全部发送等待输出进程播放完成")
print(f"📊 音频队列当前大小: {self.output_audio_queue.qsize()}")
return chunk_count > 0
finally:
response.close()
session.close()
except Exception as e:
print(f"❌ 文本转语音失败: {e}")
import traceback
print(f"❌ 详细错误: {traceback.format_exc()}")
return False
def _display_status(self):
"""显示系统状态"""
# 每秒显示一次状态
if hasattr(self, '_last_status_time'):
if time.time() - self._last_status_time < 1.0:
return
self._last_status_time = time.time()
# 状态显示
status_lines = [
f"🎯 状态: {self.state.value}",
f"📊 统计: 对话{self.stats['total_conversations']} | "
f"录音{self.stats['total_recording_time']:.1f}s | "
f"成功{self.stats['successful_processing']} | "
f"失败{self.stats['failed_processing']}"
]
# 队列状态
input_queue_size = self.input_command_queue.qsize()
output_queue_size = self.output_audio_queue.qsize()
if input_queue_size > 0 or output_queue_size > 0:
status_lines.append(f"📦 队列: 输入{input_queue_size} | 输出{output_queue_size}")
# 显示状态
status_str = " | ".join(status_lines)
print(f"\r{status_str}", end='', flush=True)
def shutdown(self):
"""关闭系统"""
print("\n🛑 正在关闭系统...")
self.running = False
# 发送关闭命令
try:
self.input_command_queue.put(ControlCommand('shutdown'))
self.output_audio_queue.put(None)
except:
pass
# 等待进程结束
if self.input_process:
try:
self.input_process.join(timeout=5)
except:
pass
if self.output_process:
try:
self.output_process.join(timeout=5)
except:
pass
# 显示最终统计
print("\n📊 最终统计:")
print(f" 总对话次数: {self.stats['total_conversations']}")
print(f" 总录音时长: {self.stats['total_recording_time']:.1f}")
print(f" 成功处理: {self.stats['successful_processing']}")
print(f" 失败处理: {self.stats['failed_processing']}")
success_rate = (self.stats['successful_processing'] /
max(1, self.stats['successful_processing'] + self.stats['failed_processing']) * 100)
print(f" 成功率: {success_rate:.1f}%")
print("👋 系统已关闭")
def main():
"""主函数"""
import argparse
parser = argparse.ArgumentParser(description='多进程音频控制系统')
parser.add_argument('--character', '-c', type=str, default='libai',
help='选择角色 (默认: libai)')
parser.add_argument('--config', type=str,
help='配置文件路径')
args = parser.parse_args()
# 加载配置
config = None
if args.config:
try:
with open(args.config, 'r', encoding='utf-8') as f:
config = json.load(f)
except Exception as e:
print(f"⚠️ 配置文件加载失败: {e}")
# 创建控制系统
control_system = ControlSystem(config)
# 设置角色
if args.character:
control_system.config['processing']['character'] = args.character
# 启动系统
control_system.start()
if __name__ == "__main__":
main()