qwen_agent/routes/voice.py
2026-03-21 01:00:02 +08:00

141 lines
4.6 KiB
Python

import asyncio
import base64
import json
import logging
from typing import Optional
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from services.voice_session_manager import VoiceSession
# Module logger, obtained under the fixed logger name 'app'.
logger = logging.getLogger('app')
# Router on which the voice WebSocket route below is registered.
router = APIRouter()
@router.websocket("/api/v3/voice/realtime")
async def voice_realtime(websocket: WebSocket):
    """
    WebSocket endpoint for voice realtime dialogue.

    Client sends:
        - {"type": "start", "bot_id": "xxx", "session_id": "xxx", "user_identifier": "xxx"}
        - {"type": "audio", "data": "<base64 pcm audio>"}
        - {"type": "text", "content": "text input"}
        - {"type": "stop"}

    Server sends:
        - {"type": "audio", "data": "<base64 pcm audio>"}
        - {"type": "asr_text", "text": "recognized text"}
        - {"type": "agent_result", "text": "agent answer"}
        - {"type": "llm_text", "text": "polished answer"}
        - {"type": "status", "status": "ready|listening|thinking|speaking|idle"}
        - {"type": "error", "message": "..."}

    Malformed client messages (invalid JSON, JSON that is not an object,
    undecodable base64 audio) are answered with an "error" message and do
    not close the connection. Messages with an unrecognised "type" are
    ignored. The loop ends on a "stop" message, on client disconnect, or on
    an unexpected exception; in every case any active session is stopped.
    """
    await websocket.accept()
    session: Optional[VoiceSession] = None

    async def send_json(data: dict):
        """Serialize *data* as JSON text and send it to the client (best effort)."""
        try:
            await websocket.send_text(json.dumps(data, ensure_ascii=False))
        except Exception as e:
            # Sending is deliberately best-effort (callbacks may fire while
            # the socket is going away), but leave a trace for debugging.
            logger.debug(f"Failed to send message to voice client: {e}")

    async def on_audio(audio_data: bytes):
        """Forward TTS audio to frontend"""
        try:
            encoded = base64.b64encode(audio_data).decode('ascii')
            await send_json({"type": "audio", "data": encoded})
        except Exception as e:
            logger.error(f"Error sending audio to client: {e}")

    async def on_asr_text(text: str):
        await send_json({"type": "asr_text", "text": text})

    async def on_agent_result(text: str):
        await send_json({"type": "agent_result", "text": text})

    async def on_llm_text(text: str):
        await send_json({"type": "llm_text", "text": text})

    async def on_status(status: str):
        await send_json({"type": "status", "status": status})

    async def on_error(message: str):
        await send_json({"type": "error", "message": message})

    try:
        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                await send_json({"type": "error", "message": "Invalid JSON"})
                continue

            # Valid JSON is not necessarily an object (e.g. `[]` or `"x"`);
            # reject it here instead of failing on `.get` below.
            if not isinstance(msg, dict):
                await send_json({"type": "error", "message": "Message must be a JSON object"})
                continue

            msg_type = msg.get("type", "")

            if msg_type == "start":
                # Initialize voice session, replacing any existing one.
                if session:
                    await session.stop()
                    # Drop the reference so a failed restart cannot leave
                    # later messages routed to an already-stopped session.
                    session = None

                bot_id = msg.get("bot_id", "")
                if not bot_id:
                    await send_json({"type": "error", "message": "bot_id is required"})
                    continue

                session = VoiceSession(
                    bot_id=bot_id,
                    session_id=msg.get("session_id"),
                    user_identifier=msg.get("user_identifier"),
                    on_audio=on_audio,
                    on_asr_text=on_asr_text,
                    on_agent_result=on_agent_result,
                    on_llm_text=on_llm_text,
                    on_status=on_status,
                    on_error=on_error,
                )
                try:
                    await session.start()
                except Exception as e:
                    logger.error(f"Failed to start voice session: {e}", exc_info=True)
                    await send_json({"type": "error", "message": f"Failed to connect: {str(e)}"})
                    session = None

            elif msg_type == "audio":
                if not session:
                    await send_json({"type": "error", "message": "Session not started"})
                    continue

                audio_b64 = msg.get("data", "")
                if audio_b64:
                    try:
                        audio_bytes = base64.b64decode(audio_b64)
                    except (ValueError, TypeError):
                        # binascii.Error is a ValueError subclass; TypeError
                        # covers a non-string "data" field. One bad frame
                        # should not tear down the whole connection.
                        await send_json({"type": "error", "message": "Invalid audio data"})
                        continue
                    await session.handle_audio(audio_bytes)

            elif msg_type == "text":
                if not session:
                    await send_json({"type": "error", "message": "Session not started"})
                    continue

                content = msg.get("content", "")
                if content:
                    await session.handle_text(content)

            elif msg_type == "stop":
                if session:
                    await session.stop()
                    session = None
                break

    except WebSocketDisconnect:
        logger.info("Voice WebSocket disconnected")
    except Exception as e:
        logger.error(f"Voice WebSocket error: {e}", exc_info=True)
    finally:
        # Final cleanup: never raise from here, but do record failures.
        if session:
            try:
                await session.stop()
            except Exception as e:
                logger.error(f"Error stopping voice session during cleanup: {e}", exc_info=True)