add voice skill

2026-04-01 10:26:17 +08:00 · 2026-04-01 10:26:17 +08:00 · 6d6c7a92ef
commit 6d6c7a92ef
parent a0e0c8c7b6
6 changed files with 231 additions and 5 deletions
--- a/agent/plugin_hook_loader.py
+++ b/agent/plugin_hook_loader.py
@ -166,11 +166,11 @@ async def _execute_command(skill_path: str, command: str, hook_type: str, config
    try:
        # 设置环境变量，传递给子进程
        env = os.environ.copy()
-        env['ASSISTANT_ID'] = getattr(config, 'bot_id', '')
+        env['ASSISTANT_ID'] = str(getattr(config, 'bot_id', ''))
-        env['USER_IDENTIFIER'] = getattr(config, 'user_identifier', '')
+        env['USER_IDENTIFIER'] = str(getattr(config, 'user_identifier', ''))
-        env['TRACE_ID'] = getattr(config, 'trace_id', '')
+        env['TRACE_ID'] = str(getattr(config, 'trace_id', ''))
-        env['SESSION_ID'] = getattr(config, 'session_id', '')
+        env['SESSION_ID'] = str(getattr(config, 'session_id', ''))
-        env['LANGUAGE'] = getattr(config, 'language', '')
+        env['LANGUAGE'] = str(getattr(config, 'language', ''))
        env['HOOK_TYPE'] = hook_type
        # 合并 config 中的自定义 shell 环境变量
--- a/routes/bot_manager.py
+++ b/routes/bot_manager.py
@ -705,6 +705,7 @@ class BotSettingsResponse(BaseModel):
    voice_speaker: Optional[str] = None  # 语音音色
    voice_system_role: Optional[str] = None  # 语音对话系统角色
    voice_speaking_style: Optional[str] = None  # 语音说话风格
    enable_voice: bool = False  # 语音对话开关
    mcp_settings: Optional[str] = None  # MCP 服务器配置 (JSON 字符串)
    updated_at: str
@ -1881,6 +1882,7 @@ async def get_bot_settings(bot_uuid: str, authorization: Optional[str] = Header(
                voice_speaker=settings.get('voice_speaker'),
                voice_system_role=settings.get('voice_system_role'),
                voice_speaking_style=settings.get('voice_speaking_style'),
                enable_voice=settings.get('enable_voice', False),
                mcp_settings=settings.get('mcp_settings'),
                updated_at=datetime_to_str(updated_at)
            )
@ -1998,6 +2000,8 @@ async def update_bot_settings(
        update_json['voice_system_role'] = request.voice_system_role
    if request.voice_speaking_style is not None:
        update_json['voice_speaking_style'] = request.voice_speaking_style
    if request.enable_voice is not None:
        update_json['enable_voice'] = request.enable_voice
    if request.mcp_settings is not None:
        update_json['mcp_settings'] = request.mcp_settings
--- a/routes/voice.py
+++ b/routes/voice.py
@ -5,6 +5,7 @@ import logging
 from typing import Optional
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from pydantic import BaseModel
 from services.voice_session_manager import VoiceSession
 from utils.settings import VOICE_DEFAULT_MODE
@ -13,6 +14,27 @@ logger = logging.getLogger('app')
 router = APIRouter()
 # Global message queue for broadcast feature.
 # Maps a queue key (built by _get_queue_key as "<bot_id>_<user_identifier>")
 # to the list of messages waiting to be spoken, oldest first. voice_broadcast
 # appends to it; a lite-mode voice session pops from the front through its
 # get_pending_message callback.
 # NOTE(review): this lives in process memory only — presumably it is not
 # shared across worker processes; confirm the deployment model.
 _pending_messages: dict[str, list[str]] = {}
 def _get_queue_key(bot_id: str, user_identifier: str) -> str:
     """Build the pending-message queue key for one bot/user pair.

     The key is the bot id and the user identifier joined by a single
     underscore.
     """
     return "{}_{}".format(bot_id, user_identifier)
 # Request body accepted by POST /api/v3/voice/broadcast.
 class BroadcastRequest(BaseModel):
    # Bot whose voice session should speak the message; combined with
    # user_identifier to form the pending-message queue key.
    bot_id: str
    # User whose voice session should receive the message.
    user_identifier: str
    # Text to queue for playback.
    message: str
@router.post("/api/v3/voice/broadcast")
async def voice_broadcast(req: BroadcastRequest):
    """Queue a message for the voice session of the given bot/user pair.

    The message is appended to the in-memory pending list stored under the
    key built from ``req.bot_id`` and ``req.user_identifier``; nothing is
    spoken here. Always returns ``{"success": True, "queued": True}``.
    """
    # NOTE(review): nothing visible here bounds the queue or checks that a
    # session exists for this key — confirm entries cannot pile up unread.
    queue_key = _get_queue_key(req.bot_id, req.user_identifier)
    if queue_key not in _pending_messages:
        _pending_messages[queue_key] = []
    _pending_messages[queue_key].append(req.message)
    return {"success": True, "queued": True}
@router.websocket("/api/v3/voice/realtime")
 async def voice_realtime(websocket: WebSocket):
@ -111,6 +133,14 @@ async def voice_realtime(websocket: WebSocket):
                if voice_mode == "lite":
                    from services.voice_lite_session import VoiceLiteSession
                    # Create callback for broadcast messages
                    queue_key = _get_queue_key(bot_id, msg.get("user_identifier", ""))
                    async def get_pending_message() -> Optional[str]:
                        """Pop and return the oldest queued broadcast message, or None when there is none."""
                        pending = _pending_messages.get(queue_key)
                        if not pending:
                            return None
                        return pending.pop(0)
                    session_kwargs["get_pending_message"] = get_pending_message
                    session = VoiceLiteSession(**session_kwargs)
                    logger.info(f"[Voice] Using lite mode for bot_id={bot_id}")
                else:
@ -118,6 +148,9 @@ async def voice_realtime(websocket: WebSocket):
                try:
                    await session.start()
                    # Clear old messages on new session connection
                    if voice_mode == "lite":
                        _pending_messages[queue_key] = []
                except Exception as e:
                    logger.error(f"Failed to start voice session: {e}", exc_info=True)
                    await send_json({"type": "error", "message": f"Failed to connect: {str(e)}"})
--- a/services/voice_lite_session.py
+++ b/services/voice_lite_session.py
@ -35,6 +35,7 @@ class VoiceLiteSession:
        on_llm_text: Optional[Callable[[str], Awaitable[None]]] = None,
        on_status: Optional[Callable[[str], Awaitable[None]]] = None,
        on_error: Optional[Callable[[str], Awaitable[None]]] = None,
        get_pending_message: Optional[Callable[[], Awaitable[Optional[str]]]] = None,
    ):
        self.bot_id = bot_id
        self.session_id = session_id or str(uuid.uuid4())
@ -52,8 +53,11 @@ class VoiceLiteSession:
        self._on_llm_text = on_llm_text
        self._on_status = on_status
        self._on_error = on_error
        self._get_pending_message = get_pending_message
        self._running = False
        self._status: str = "ready"  # Current session status
        self._idle_check_task: Optional[asyncio.Task] = None
        self._asr_client: Optional[StreamingASRClient] = None
        self._asr_receive_task: Optional[asyncio.Task] = None
        self._agent_task: Optional[asyncio.Task] = None
@ -86,10 +90,17 @@ class VoiceLiteSession:
        self._running = True
        await self._emit_status("ready")
        # Start idle check task for broadcast messages
        if self._get_pending_message:
            self._idle_check_task = asyncio.create_task(self._idle_check_loop())
    async def stop(self) -> None:
        """Gracefully stop the session."""
        self._running = False
        if self._idle_check_task and not self._idle_check_task.done():
            self._idle_check_task.cancel()
        if self._vad_finish_task and not self._vad_finish_task.done():
            self._vad_finish_task.cancel()
@ -511,9 +522,49 @@ class VoiceLiteSession:
                    await self._on_audio(audio_chunk)
    async def _emit_status(self, status: str) -> None:
        """Record ``status`` as the current session status and pass it to the status callback, if one is set."""
        self._status = status
        notify = self._on_status
        if not notify:
            return
        await notify(status)
    async def _emit_error(self, message: str) -> None:
        """Forward ``message`` to the error callback; do nothing when no callback is set."""
        report = self._on_error
        if not report:
            return
        await report(message)
    async def _idle_check_loop(self) -> None:
        """Poll for queued broadcast messages and speak them while the session is free.

        Runs as a background task until ``self._running`` is cleared or the
        task is cancelled. Once per second it checks the session status; when
        the status is "ready" or "idle" it asks the pending-message callback
        for one message and, if it gets one, speaks it. Any other exception is
        logged as a warning and the loop keeps going.
        """
        speakable_states = ("ready", "idle")
        while self._running:
            try:
                await asyncio.sleep(1.0)  # poll interval between queue checks
                if self._status not in speakable_states:
                    continue
                if not self._get_pending_message:
                    continue
                pending = await self._get_pending_message()
                if pending:
                    await self.speak_text(pending)
            except asyncio.CancelledError:
                # Cancellation (e.g. from stop()) ends the loop quietly.
                break
            except Exception as e:
                logger.warning(f"[VoiceLite] Idle check error: {e}")
    async def speak_text(self, text: str) -> None:
        """Speak ``text`` through TTS without involving the agent (broadcast path).

        Blank or whitespace-only text is ignored. Otherwise the session status
        is set to "speaking", the text is synthesized with the session's
        speaker, and each audio chunk is handed to the audio callback when one
        is set. When the client sample rate is not 24000 the raw synthesis
        stream is used and every chunk goes through ``_resample_output``
        first. Synthesis errors are logged rather than raised. Afterwards the
        status is set to "idle" if the session is still running.
        """
        if not text.strip():
            return

        logger.info(f"[VoiceLite] Broadcasting: '{text[:80]}'")
        await self._emit_status("speaking")
        try:
            tts_client = StreamingTTSClient(speaker=self._speaker)
            needs_resample = self._client_sample_rate != 24000
            if needs_resample:
                chunks = tts_client.synthesize_raw(text)
            else:
                chunks = tts_client.synthesize(text)
            async for chunk in chunks:
                if not self._on_audio:
                    continue
                payload = self._resample_output(chunk) if needs_resample else chunk
                await self._on_audio(payload)
        except Exception as e:
            logger.error(f"[VoiceLite] Broadcast TTS error: {e}", exc_info=True)
        finally:
            # Do not report "idle" for a session that has already been stopped.
            if self._running:
                await self._emit_status("idle")
--- a/skills/voice-notification/SKILL.md
+++ b/skills/voice-notification/SKILL.md
@ -0,0 +1,57 @@
 ---
 name: voice-notification
 description: Voice Notification - Push voice broadcast messages to active voice sessions for real-time TTS playback
 ---
 # Voice Notification - Voice Broadcast
 Push voice broadcast messages to users' active voice sessions. The message will be played via TTS when the session is in idle state.
 ## Quick Start
 When a user requests to send a voice notification:
 1. Compose the message content
 2. Call voice_notify.py to send the broadcast
 ## Instructions
 ### Tool Path
 ```bash
 python {skill_dir}/scripts/voice_notify.py broadcast --message "Your message here"
 ```
 ### Parameters
 | Parameter | Required | Description |
 |-----------|----------|-------------|
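 | `--message` | Yes | The message content to be spoken via TTS |
 | `--api-url` | No | Broadcast API endpoint URL (default: `http://localhost:8001/api/v3/voice/broadcast`) |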
 ### Response
 - Success: `{"success": true, "queued": true}`
 - Error: `{"success": false, "error": "..."}`
 ## Examples
 **User**: "Send a voice notification: the meeting is starting"
 ```bash
 python {skill_dir}/scripts/voice_notify.py broadcast \
  --message "The meeting is starting soon, please get ready"
 ```
 **User**: "Notify me via voice that my coffee is ready"
 ```bash
 python {skill_dir}/scripts/voice_notify.py broadcast \
  --message "Your coffee is ready, please come pick it up"
 ```
 ## Guidelines
 - The target user must have an active voice session connected to `/api/v3/voice/realtime`
 - The voice session must be in lite mode (`voice_mode: "lite"`)
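 - Messages are queued and played one at a time while the session is in the `ready` or `idle` state; the queue is checked about once per second
 - Messages queued before a session connects are discarded when that session starts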
 - Keep messages concise for better TTS experience
 - Message language should match the user's preferred language
--- a/skills/voice-notification/scripts/voice_notify.py
+++ b/skills/voice-notification/scripts/voice_notify.py
@ -0,0 +1,81 @@
 #!/usr/bin/env python3
 """Voice notification script for broadcasting messages to active voice sessions."""
 import argparse
 import json
 import os
 import sys
 from urllib.request import Request, urlopen
 from urllib.error import URLError, HTTPError
 # Default API endpoint
 DEFAULT_API_URL = "http://localhost:8001/api/v3/voice/broadcast"


 def broadcast_message(message: str, api_url: str = DEFAULT_API_URL) -> dict:
     """Send a broadcast message to the voice API.

     The target bot and user are taken from the environment, not from
     arguments: the bot id from ``BOT_ID`` (falling back to ``ASSISTANT_ID``)
     and the user from ``USER_IDENTIFIER``.

     Args:
         message: The message content to be spoken
         api_url: The API endpoint URL

     Returns:
         The parsed JSON response from the API on success, otherwise
         ``{"success": False, "error": "..."}`` describing a missing
         environment variable, an HTTP error, a connection error, or any
         other failure. This function does not raise for those cases.
     """
     # agent/plugin_hook_loader.py exports the bot id as ASSISTANT_ID, so
     # accept that name when BOT_ID is absent or empty.
     bot_id = os.environ.get("BOT_ID") or os.environ.get("ASSISTANT_ID", "")
     user_identifier = os.environ.get("USER_IDENTIFIER", "")

     if not bot_id:
         return {"success": False, "error": "BOT_ID environment variable not set"}
     if not user_identifier:
         return {"success": False, "error": "USER_IDENTIFIER environment variable not set"}

     payload = {
         "bot_id": bot_id,
         "user_identifier": user_identifier,
         "message": message
     }

     req = Request(
         api_url,
         data=json.dumps(payload).encode("utf-8"),
         headers={"Content-Type": "application/json"},
         method="POST"
     )

     try:
         with urlopen(req, timeout=10) as response:
             return json.loads(response.read().decode("utf-8"))
     except HTTPError as e:
         # HTTPError subclasses URLError, so it must be handled first.
         return {"success": False, "error": f"HTTP {e.code}: {e.reason}"}
     except URLError as e:
         return {"success": False, "error": f"Connection error: {e.reason}"}
     except Exception as e:
         # CLI boundary: report anything else (e.g. a non-JSON body) as a result dict.
         return {"success": False, "error": str(e)}
 def main():
     """Parse the command line, run the requested command and exit with its status.

     The only command is ``broadcast``: it sends ``--message`` to the API at
     ``--api-url``, prints the result as indented JSON and exits with 0 when
     the result reports success, otherwise 1. With no recognised command the
     help text is printed and the exit status is 1.
     """
     parser = argparse.ArgumentParser(description="Voice notification broadcast tool")
     commands = parser.add_subparsers(dest="command", help="Available commands")

     # Broadcast command
     broadcast_cmd = commands.add_parser("broadcast", help="Broadcast a message to active voice session")
     broadcast_cmd.add_argument("--message", required=True, help="Message content to be spoken")
     broadcast_cmd.add_argument("--api-url", default=DEFAULT_API_URL, help="API endpoint URL")

     args = parser.parse_args()

     if args.command != "broadcast":
         parser.print_help()
         sys.exit(1)

     outcome = broadcast_message(message=args.message, api_url=args.api_url)
     print(json.dumps(outcome, ensure_ascii=False, indent=2))
     exit_code = 0 if outcome.get("success") else 1
     sys.exit(exit_code)


 if __name__ == "__main__":
     main()