透传接口现在支持视觉图片识别

2026-06-07 11:02:10 +08:00 · 2026-06-07 11:02:10 +08:00 · 1fc105a732
commit 1fc105a732
parent f45f55b50a
2 changed files with 37 additions and 4 deletions
--- a/routes/chat.py
+++ b/routes/chat.py
@@ -13,7 +13,7 @@ logger = logging.getLogger('app')
 from utils import (
    Message, ChatRequest, ChatResponse, BatchSaveChatRequest, BatchSaveChatResponse
 )
-from utils.api_models import ChatRequestV2, ChatRequestV3
+from utils.api_models import ChatRequestV2, ChatRequestV3, LLMPassthroughRequest
 from utils.fastapi_utils import (
    process_messages,
    create_project_directory, extract_api_key_from_auth, generate_v2_auth_token, fetch_bot_config, fetch_bot_config_from_db,
@@ -1004,15 +1004,19 @@ async def build_llm_from_bot_config(bot_id: str, user_identifier: Optional[str]


@router.post("/api/v3/llm/chat/completions")
-async def llm_passthrough_v3(request: ChatRequestV3, authorization: Optional[str] = Header(None)):
+async def llm_passthrough_v3(request: LLMPassthroughRequest, authorization: Optional[str] = Header(None)):
    """LLM passthrough API - direct LLM call, bypassing all agent logic.

    Only model / api_key / model_server are read from the bot's database config
    (resolved via bot_id). Messages are forwarded to the LLM as-is.

+    Supports vision/multimodal input: a message's content can be a plain string
+    or a list of OpenAI-style content parts (text + image_url). Whether images are
+    actually recognized depends on the configured model being vision-capable.
+
    Required Parameters:
        - bot_id: str - target bot id (used to look up LLM config from db)
-        - messages: List[Message] - conversation messages, passed through directly
+        - messages: List[VisionMessage] - conversation messages, passed through directly

    Optional Parameters:
        - stream: bool - whether to stream the output, default false
--- a/utils/api_models.py
+++ b/utils/api_models.py
@@ -3,7 +3,7 @@
 API data models and response schemas.
 """

-from typing import Dict, List, Optional, Any, AsyncGenerator
+from typing import Dict, List, Optional, Any, AsyncGenerator, Union
 from pydantic import BaseModel, Field, field_validator, ConfigDict

 class Message(BaseModel):
@@ -89,6 +89,35 @@ class ChatRequestV3(BaseModel):
    user_identifier: Optional[str] = None


+class VisionMessage(BaseModel):
+    """Message supporting multimodal content (text + images).
+
+    content accepts either:
+    - a plain string (text only), or
+    - a list of OpenAI-style content parts, e.g.:
+        [
+            {"type": "text", "text": "what is in this image?"},
+            {"type": "image_url", "image_url": {"url": "https://... or data:image/png;base64,..."}}
+        ]
+    """
+    role: str
+    content: Union[str, List[Dict[str, Any]]]
+
+
+class LLMPassthroughRequest(BaseModel):
+    """LLM passthrough request model - supports vision/multimodal content.
+
+    Only bot_id and messages are required. Config (model/api_key/model_server)
+    is resolved from the database via bot_id.
+    """
+    messages: List[VisionMessage]
+    bot_id: str
+    stream: Optional[bool] = False
+    user_identifier: Optional[str] = None
+
+    model_config = ConfigDict(extra='allow')
+
+
 class FileProcessRequest(BaseModel):
    unique_id: str
    files: Optional[Dict[str, List[str]]] = Field(default=None, description="Files organized by key groups. Each key maps to a list of file paths (supports zip files)")