diff --git a/routes/chat.py b/routes/chat.py
index 07819a3..098bd61 100644
--- a/routes/chat.py
+++ b/routes/chat.py
@@ -13,7 +13,7 @@ logger = logging.getLogger('app')
 from utils import (
     Message, ChatRequest, ChatResponse, BatchSaveChatRequest, BatchSaveChatResponse
 )
-from utils.api_models import ChatRequestV2, ChatRequestV3
+from utils.api_models import ChatRequestV2, ChatRequestV3, LLMPassthroughRequest
 from utils.fastapi_utils import (
     process_messages, create_project_directory, extract_api_key_from_auth,
     generate_v2_auth_token, fetch_bot_config, fetch_bot_config_from_db,
@@ -1004,15 +1004,19 @@ async def build_llm_from_bot_config(bot_id: str, user_identifier: Optional[str]
 
 
 @router.post("/api/v3/llm/chat/completions")
-async def llm_passthrough_v3(request: ChatRequestV3, authorization: Optional[str] = Header(None)):
+async def llm_passthrough_v3(request: LLMPassthroughRequest, authorization: Optional[str] = Header(None)):
     """LLM passthrough API - direct LLM call, bypassing all agent logic.
 
     Only model / api_key / model_server are read from the bot's database config
     (resolved via bot_id). Messages are forwarded to the LLM as-is.
 
+    Supports vision/multimodal input: a message's content can be a plain string
+    or a list of OpenAI-style content parts (text + image_url). Whether images are
+    actually recognized depends on the configured model being vision-capable.
+
     Required Parameters:
     - bot_id: str - target bot id (used to look up LLM config from db)
-    - messages: List[Message] - conversation messages, passed through directly
+    - messages: List[VisionMessage] - conversation messages, passed through directly
 
     Optional Parameters:
     - stream: bool - whether to stream the output, default false
diff --git a/utils/api_models.py b/utils/api_models.py
index a65b2f1..d0f6fd0 100644
--- a/utils/api_models.py
+++ b/utils/api_models.py
@@ -3,7 +3,7 @@
 API data models and response schemas.
 """
 
-from typing import Dict, List, Optional, Any, AsyncGenerator
+from typing import Dict, List, Optional, Any, AsyncGenerator, Union
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 
 class Message(BaseModel):
@@ -89,6 +89,35 @@ class ChatRequestV3(BaseModel):
     user_identifier: Optional[str] = None
 
 
+class VisionMessage(BaseModel):
+    """Message supporting multimodal content (text + images).
+
+    content accepts either:
+    - a plain string (text only), or
+    - a list of OpenAI-style content parts, e.g.:
+        [
+            {"type": "text", "text": "what is in this image?"},
+            {"type": "image_url", "image_url": {"url": "https://... or data:image/png;base64,..."}}
+        ]
+    """
+    role: str
+    content: Union[str, List[Dict[str, Any]]]
+
+
+class LLMPassthroughRequest(BaseModel):
+    """LLM passthrough request model - supports vision/multimodal content.
+
+    Only bot_id and messages are required. Config (model/api_key/model_server)
+    is resolved from the database via bot_id.
+    """
+    messages: List[VisionMessage]
+    bot_id: str
+    stream: Optional[bool] = False
+    user_identifier: Optional[str] = None
+
+    model_config = ConfigDict(extra='allow')
+
+
 class FileProcessRequest(BaseModel):
     unique_id: str
     files: Optional[Dict[str, List[str]]] = Field(default=None, description="Files organized by key groups. Each key maps to a list of file paths (supports zip files)")