Local-Voice/control_system.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
多进程音频控制系统
实现主控制进程和状态管理
"""

import multiprocessing as mp
import queue
import time
import threading
import requests
import json
import base64
import gzip
import uuid
import asyncio
import websockets
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, asdict
from enum import Enum
import os
import sys

from audio_processes import (
    InputProcess, OutputProcess,
    RecordingState, ControlCommand, ProcessEvent
)

class ControlSystem:
    """主控制系统"""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or self._get_default_config()

        # 进程间通信
        self.input_command_queue = mp.Queue(maxsize=100)    # 主进程 → 输入进程
        self.input_event_queue = mp.Queue(maxsize=100)      # 输入进程 → 主进程
        self.output_audio_queue = mp.Queue(maxsize=1000)    # 主进程 → 输出进程

        # 进程
        self.input_process = None
        self.output_process = None

        # 状态管理
        self.state = RecordingState.IDLE
        self.processing_complete = False
        self.playback_complete = False

        # 当前处理的数据
        self.current_audio_data = None
        self.current_audio_metadata = None

        # API配置
        self.api_config = self._setup_api_config()

        # 统计信息
        self.stats = {
            'total_conversations': 0,
            'total_recording_time': 0,
            'successful_processing': 0,
            'failed_processing': 0
        }

        # 运行状态
        self.running = True

        # 检查依赖
        self._check_dependencies()

    def _get_default_config(self) -> Dict[str, Any]:
        """获取默认配置"""
        return {
            'system': {
                'max_queue_size': 1000,
                'process_timeout': 30,
                'heartbeat_interval': 1.0
            },
            'audio': {
                'sample_rate': 16000,
                'channels': 1,
                'chunk_size': 1024
            },
            'recording': {
                'min_duration': 2.0,
                'max_duration': 30.0,
                'silence_threshold': 3.0
            },
            'processing': {
                'enable_asr': True,
                'enable_llm': True,
                'enable_tts': True,
                'character': 'libai'
            }
        }

    def _setup_api_config(self) -> Dict[str, Any]:
        """设置API配置"""
        config = {
            'asr': {
                'appid': "8718217928",
                'token': "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
                'cluster': "volcengine_input_common",
                'ws_url': "wss://openspeech.bytedance.com/api/v2/asr"
            },
            'llm': {
                'api_url': "https://ark.cn-beijing.volces.com/api/v3/chat/completions",
                'model': "doubao-seed-1-6-flash-250828",
                'api_key': os.environ.get("ARK_API_KEY", ""),
                'max_tokens': 50
            },
            'tts': {
                'url': "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
                'app_id': "8718217928",
                'access_key': "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc",
                'resource_id': "volc.service_type.10029",
                'app_key': "aGjiRDfUWi",
                'speaker': "zh_female_wanqudashu_moon_bigtts"
            }
        }

        # 加载角色配置
        character_config = self._load_character_config(self.config['processing']['character'])
        if character_config and "voice" in character_config:
            config['tts']['speaker'] = character_config["voice"]

        return config

    def _load_character_config(self, character_name: str) -> Optional[Dict[str, Any]]:
        """加载角色配置"""
        characters_dir = os.path.join(os.path.dirname(__file__), "characters")
        config_file = os.path.join(characters_dir, f"{character_name}.json")

        if not os.path.exists(config_file):
            print(f"⚠️ 角色配置文件不存在: {config_file}")
            return None

        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)

            print(f"✅ 加载角色: {config.get('name', character_name)}")
            return config

        except Exception as e:
            print(f"❌ 加载角色配置失败: {e}")
            return None

    def _check_dependencies(self):
        """检查依赖库"""
        missing_deps = []

        try:
            import pyaudio
        except ImportError:
            missing_deps.append("pyaudio")

        try:
            import numpy
        except ImportError:
            missing_deps.append("numpy")

        try:
            import requests
        except ImportError:
            missing_deps.append("requests")

        try:
            import websockets
        except ImportError:
            missing_deps.append("websockets")

        if missing_deps:
            print(f"❌ 缺少依赖库: {', '.join(missing_deps)}")
            print("请安装: pip install " + " ".join(missing_deps))
            sys.exit(1)

        # 检查API密钥
        if not self.api_config['llm']['api_key']:
            print("⚠️ 未设置 ARK_API_KEY 环境变量，大语言模型功能将被禁用")
            self.config['processing']['enable_llm'] = False

    def start(self):
        """启动系统"""
        print("🚀 启动多进程音频控制系统")
        print("=" * 60)

        # 创建并启动输入进程
        input_config = {
            'zcr_min': 2400,
            'zcr_max': 12000,
            'min_recording_time': self.config['recording']['min_duration'],
            'max_recording_time': self.config['recording']['max_duration'],
            'silence_threshold': self.config['recording']['silence_threshold'],
            'pre_record_duration': 2.0
        }

        self.input_process = mp.Process(
            target=InputProcess(
                self.input_command_queue,
                self.input_event_queue,
                input_config
            ).run
        )

        # 创建并启动输出进程
        output_config = {
            'buffer_size': 1000,
            'show_progress': True,
            'progress_interval': 100
        }

        self.output_process = mp.Process(
            target=OutputProcess(
                self.output_audio_queue,
                output_config
            ).run
        )

        # 启动进程
        self.input_process.start()
        self.output_process.start()

        print("✅ 所有进程已启动")
        print("🎙️ 输入进程：负责录音和语音检测")
        print("🔊 输出进程：负责音频播放")
        print("🎯 主控制：负责协调和AI处理")
        print("=" * 60)

        # 启动主控制循环
        self._control_loop()

    def _control_loop(self):
        """主控制循环"""
        print("🎯 主控制循环启动")

        try:
            while self.running:
                # 根据状态处理不同逻辑
                if self.state == RecordingState.IDLE:
                    self._handle_idle_state()

                elif self.state == RecordingState.RECORDING:
                    self._handle_recording_state()

                elif self.state == RecordingState.PROCESSING:
                    self._handle_processing_state()

                elif self.state == RecordingState.PLAYING:
                    self._handle_playing_state()

                # 检查进程事件
                self._check_events()

                # 显示状态
                self._display_status()

                # 控制循环频率
                time.sleep(0.1)

        except KeyboardInterrupt:
            print("\n👋 收到退出信号...")
            self.shutdown()
        except Exception as e:
            print(f"❌ 主控制循环错误: {e}")
            self.shutdown()

    def _handle_idle_state(self):
        """处理空闲状态"""
        if self.state == RecordingState.IDLE:
            # 启用输入进程录音功能
            self.input_command_queue.put(ControlCommand('enable_recording'))
            self.state = RecordingState.RECORDING
            print("🎯 状态：IDLE → RECORDING")

    def _handle_recording_state(self):
        """处理录音状态"""
        # 等待输入进程发送录音完成事件
        pass

    def _handle_processing_state(self):
        """处理状态"""
        if not self.processing_complete:
            self._process_audio_pipeline()

    def _handle_playing_state(self):
        """处理播放状态"""
        # 检查播放是否完成
        if self.output_audio_queue.qsize() == 0 and not self.playback_complete:
            # 等待一小段时间确保播放完成
            time.sleep(0.5)
            if self.output_audio_queue.qsize() == 0:
                self.playback_complete = True
                self.stats['total_conversations'] += 1

    def _check_events(self):
        """检查进程事件"""
        # 检查输入进程事件
        try:
            while True:
                event = self.input_event_queue.get_nowait()

                if event.event_type == 'recording_complete':
                    print("📡 主控制：收到录音完成事件")
                    self._handle_recording_complete(event)

        except queue.Empty:
            pass

    def _handle_recording_complete(self, event: ProcessEvent):
        """处理录音完成事件"""
        # 禁用输入进程录音功能
        self.input_command_queue.put(ControlCommand('disable_recording'))

        # 保存录音数据
        self.current_audio_data = event.data
        self.current_audio_metadata = event.metadata

        # 更新统计
        self.stats['total_recording_time'] += event.metadata['duration']

        # 切换到处理状态
        self.state = RecordingState.PROCESSING
        self.processing_complete = False
        self.playback_complete = False

        print(f"🎯 状态：RECORDING → PROCESSING (时长: {event.metadata['duration']:.2f}s)")

    def _process_audio_pipeline(self):
        """处理音频流水线：STT + LLM + TTS"""
        try:
            print("🤖 开始处理音频流水线")

            # 1. 语音识别 (STT)
            if self.config['processing']['enable_asr']:
                text = self._speech_to_text(self.current_audio_data)
                if not text:
                    print("❌ 语音识别失败")
                    self._handle_processing_failure()
                    return

                print(f"📝 识别结果: {text}")
            else:
                text = "语音识别功能已禁用"

            # 2. 大语言模型 (LLM)
            if self.config['processing']['enable_llm']:
                response = self._call_llm(text)
                if not response:
                    print("❌ 大语言模型调用失败")
                    self._handle_processing_failure()
                    return

                print(f"💬 AI回复: {response}")
            else:
                response = "大语言模型功能已禁用"

            # 3. 文本转语音 (TTS)
            if self.config['processing']['enable_tts']:
                success = self._text_to_speech_streaming(response)
                if not success:
                    print("❌ 文本转语音失败")
                    self._handle_processing_failure()
                    return
            else:
                print("ℹ️ 文本转语音功能已禁用")
                # 直接发送结束信号
                self.output_audio_queue.put(None)

            # 标记处理完成
            self.processing_complete = True
            self.state = RecordingState.PLAYING
            self.stats['successful_processing'] += 1

            print("🎯 状态：PROCESSING → PLAYING")

        except Exception as e:
            print(f"❌ 处理流水线错误: {e}")
            self._handle_processing_failure()

    def _handle_processing_failure(self):
        """处理失败情况"""
        self.stats['failed_processing'] += 1
        self.state = RecordingState.IDLE
        self.processing_complete = True
        self.playback_complete = True
        print("🎯 状态：PROCESSING → IDLE (失败)")

    def _speech_to_text(self, audio_data: bytes) -> Optional[str]:
        """语音转文字"""
        try:
            return asyncio.run(self._recognize_audio_async(audio_data))
        except Exception as e:
            print(f"❌ 语音识别异常: {e}")
            return None

    async def _recognize_audio_async(self, audio_data: bytes) -> Optional[str]:
        """异步语音识别"""
        if not self.config['processing']['enable_asr']:
            return "语音识别功能已禁用"

        try:
            import websockets

            # 生成ASR头部
            def generate_asr_header(message_type=1, message_type_specific_flags=0):
                PROTOCOL_VERSION = 0b0001
                DEFAULT_HEADER_SIZE = 0b0001
                JSON = 0b0001
                GZIP = 0b0001

                header = bytearray()
                header.append((PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE)
                header.append((message_type << 4) | message_type_specific_flags)
                header.append((JSON << 4) | GZIP)
                header.append(0x00)  # reserved
                return header

            # 解析ASR响应
            def parse_asr_response(res):
                # 简化的响应解析
                if len(res) < 8:
                    return {}

                message_type = res[1] >> 4
                payload_size = int.from_bytes(res[4:8], "big", signed=False)
                payload_msg = res[8:8+payload_size]

                if message_type == 0b1001:  # SERVER_FULL_RESPONSE
                    try:
                        if payload_msg.startswith(b'{'):
                            result = json.loads(payload_msg.decode('utf-8'))
                            return result
                    except:
                        pass

                return {}

            # 构建请求参数
            reqid = str(uuid.uuid4())
            request_params = {
                'app': {
                    'appid': self.api_config['asr']['appid'],
                    'cluster': self.api_config['asr']['cluster'],
                    'token': self.api_config['asr']['token'],
                },
                'user': {
                    'uid': 'multiprocess_asr'
                },
                'request': {
                    'reqid': reqid,
                    'nbest': 1,
                    'workflow': 'audio_in,resample,partition,vad,fe,decode,itn,nlu_punctuate',
                    'show_language': False,
                    'show_utterances': False,
                    'result_type': 'full',
                    "sequence": 1
                },
                'audio': {
                    'format': 'wav',
                    'rate': self.config['audio']['sample_rate'],
                    'language': 'zh-CN',
                    'bits': 16,
                    'channel': self.config['audio']['channels'],
                    'codec': 'raw'
                }
            }

            # 构建请求
            payload_bytes = str.encode(json.dumps(request_params))
            payload_bytes = gzip.compress(payload_bytes)
            full_client_request = bytearray(generate_asr_header())
            full_client_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
            full_client_request.extend(payload_bytes)

            # 设置认证头
            additional_headers = {'Authorization': 'Bearer; {}'.format(self.api_config['asr']['token'])}

            # 连接WebSocket
            async with websockets.connect(
                self.api_config['asr']['ws_url'],
                additional_headers=additional_headers,
                max_size=1000000000
            ) as ws:
                # 发送请求
                await ws.send(full_client_request)
                res = await ws.recv()
                result = parse_asr_response(res)

                # 发送音频数据
                chunk_size = int(self.config['audio']['channels'] * 2 *
                                self.config['audio']['sample_rate'] * 15000 / 1000)

                for offset in range(0, len(audio_data), chunk_size):
                    chunk = audio_data[offset:offset + chunk_size]
                    last = (offset + chunk_size) >= len(audio_data)

                    payload_bytes = gzip.compress(chunk)
                    audio_only_request = bytearray(
                        generate_asr_header(
                            message_type=0b0010,
                            message_type_specific_flags=0b0010 if last else 0
                        )
                    )
                    audio_only_request.extend((len(payload_bytes)).to_bytes(4, 'big'))
                    audio_only_request.extend(payload_bytes)

                    await ws.send(audio_only_request)
                    res = await ws.recv()
                    result = parse_asr_response(res)

                # 获取最终结果
                if 'payload_msg' in result and 'result' in result['payload_msg']:
                    results = result['payload_msg']['result']
                    if results:
                        return results[0].get('text', '识别失败')

                return None

        except Exception as e:
            print(f"❌ 语音识别失败: {e}")
            return None

    def _call_llm(self, text: str) -> Optional[str]:
        """调用大语言模型"""
        if not self.config['processing']['enable_llm']:
            return "大语言模型功能已禁用"

        try:
            # 获取角色配置
            character_config = self._load_character_config(self.config['processing']['character'])
            if character_config and "system_prompt" in character_config:
                system_prompt = character_config["system_prompt"]
            else:
                system_prompt = "你是一个智能助手，请根据用户的语音输入提供有帮助的回答。保持回答简洁明了。"

            # 构建请求
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_config['llm']['api_key']}"
            }

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ]

            data = {
                "model": self.api_config['llm']['model'],
                "messages": messages,
                "max_tokens": self.api_config['llm']['max_tokens'],
                "stream": False  # 非流式，简化实现
            }

            response = requests.post(
                self.api_config['llm']['api_url'],
                headers=headers,
                json=data,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                if 'choices' in result and len(result['choices']) > 0:
                    content = result['choices'][0]['message']['content']
                    return content.strip()

            print(f"❌ LLM API调用失败: {response.status_code}")
            return None

        except Exception as e:
            print(f"❌ 大语言模型调用失败: {e}")
            return None

    def _text_to_speech_streaming(self, text: str) -> bool:
        """文本转语音（流式）"""
        if not self.config['processing']['enable_tts']:
            return False

        try:
            print("🎵 开始文本转语音")

            # 发送元数据
            self.output_audio_queue.put(f"METADATA:{text[:30]}...")

            # 构建请求头
            headers = {
                "X-Api-App-Id": self.api_config['tts']['app_id'],
                "X-Api-Access-Key": self.api_config['tts']['access_key'],
                "X-Api-Resource-Id": self.api_config['tts']['resource_id'],
                "X-Api-App-Key": self.api_config['tts']['app_key'],
                "Content-Type": "application/json",
                "Connection": "keep-alive"
            }

            # 构建请求参数
            payload = {
                "user": {
                    "uid": "multiprocess_tts"
                },
                "req_params": {
                    "text": text,
                    "speaker": self.api_config['tts']['speaker'],
                    "audio_params": {
                        "format": "pcm",
                        "sample_rate": self.config['audio']['sample_rate'],
                        "enable_timestamp": True
                    },
                    "additions": "{\"explicit_language\":\"zh\",\"disable_markdown_filter\":true, \"enable_timestamp\":true}\"}"
                }
            }

            # 发送请求
            session = requests.Session()
            try:
                response = session.post(
                    self.api_config['tts']['url'],
                    headers=headers,
                    json=payload,
                    stream=True
                )

                if response.status_code != 200:
                    print(f"❌ TTS请求失败: {response.status_code}")
                    return False

                # 处理流式响应
                total_audio_size = 0
                chunk_count = 0

                for chunk in response.iter_lines(decode_unicode=True):
                    if not chunk:
                        continue

                    try:
                        data = json.loads(chunk)

                        if data.get("code", 0) == 0 and "data" in data and data["data"]:
                            chunk_audio = base64.b64decode(data["data"])
                            audio_size = len(chunk_audio)
                            total_audio_size += audio_size
                            chunk_count += 1

                            # 发送到输出进程
                            self.output_audio_queue.put(chunk_audio)

                            # 显示进度
                            if chunk_count % 10 == 0:
                                progress = f"📥 TTS生成: {chunk_count} 块 | {total_audio_size / 1024:.1f} KB"
                                print(f"\r{progress}", end='', flush=True)

                        if data.get("code", 0) == 20000000:
                            break

                    except json.JSONDecodeError:
                        continue

                print(f"\n✅ TTS音频生成完成: {chunk_count} 块, {total_audio_size / 1024:.1f} KB")

                # 发送结束信号
                self.output_audio_queue.put(None)

                return chunk_count > 0

            finally:
                response.close()
                session.close()

        except Exception as e:
            print(f"❌ 文本转语音失败: {e}")
            return False

    def _display_status(self):
        """显示系统状态"""
        # 每秒显示一次状态
        if hasattr(self, '_last_status_time'):
            if time.time() - self._last_status_time < 1.0:
                return

        self._last_status_time = time.time()

        # 状态显示
        status_lines = [
            f"🎯 状态: {self.state.value}",
            f"📊 统计: 对话{self.stats['total_conversations']} | "
            f"录音{self.stats['total_recording_time']:.1f}s | "
            f"成功{self.stats['successful_processing']} | "
            f"失败{self.stats['failed_processing']}"
        ]

        # 队列状态
        input_queue_size = self.input_command_queue.qsize()
        output_queue_size = self.output_audio_queue.qsize()

        if input_queue_size > 0 or output_queue_size > 0:
            status_lines.append(f"📦 队列: 输入{input_queue_size} | 输出{output_queue_size}")

        # 显示状态
        status_str = " | ".join(status_lines)
        print(f"\r{status_str}", end='', flush=True)

    def shutdown(self):
        """关闭系统"""
        print("\n🛑 正在关闭系统...")

        self.running = False

        # 发送关闭命令
        try:
            self.input_command_queue.put(ControlCommand('shutdown'))
            self.output_audio_queue.put(None)
        except:
            pass

        # 等待进程结束
        if self.input_process:
            try:
                self.input_process.join(timeout=5)
            except:
                pass

        if self.output_process:
            try:
                self.output_process.join(timeout=5)
            except:
                pass

        # 显示最终统计
        print("\n📊 最终统计:")
        print(f"   总对话次数: {self.stats['total_conversations']}")
        print(f"   总录音时长: {self.stats['total_recording_time']:.1f} 秒")
        print(f"   成功处理: {self.stats['successful_processing']}")
        print(f"   失败处理: {self.stats['failed_processing']}")

        success_rate = (self.stats['successful_processing'] /
                       max(1, self.stats['successful_processing'] + self.stats['failed_processing']) * 100)
        print(f"   成功率: {success_rate:.1f}%")

        print("👋 系统已关闭")

def main():
    """主函数"""
    import argparse

    parser = argparse.ArgumentParser(description='多进程音频控制系统')
    parser.add_argument('--character', '-c', type=str, default='libai',
                       help='选择角色 (默认: libai)')
    parser.add_argument('--config', type=str,
                       help='配置文件路径')

    args = parser.parse_args()

    # 加载配置
    config = None
    if args.config:
        try:
            with open(args.config, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            print(f"⚠️ 配置文件加载失败: {e}")

    # 创建控制系统
    control_system = ControlSystem(config)

    # 设置角色
    if args.character:
        control_system.config['processing']['character'] = args.character

    # 启动系统
    control_system.start()

if __name__ == "__main__":
    main()