diff --git a/routes/chat.py b/routes/chat.py
index 07819a3..098bd61 100644
--- a/routes/chat.py
+++ b/routes/chat.py
@@ -13,7 +13,7 @@ logger = logging.getLogger('app')
 from utils import (
     Message, ChatRequest, ChatResponse, BatchSaveChatRequest, BatchSaveChatResponse
 )
-from utils.api_models import ChatRequestV2, ChatRequestV3
+from utils.api_models import ChatRequestV2, ChatRequestV3, LLMPassthroughRequest
 from utils.fastapi_utils import (
     process_messages,
     create_project_directory, extract_api_key_from_auth, generate_v2_auth_token, fetch_bot_config, fetch_bot_config_from_db,
@@ -1004,15 +1004,19 @@ async def build_llm_from_bot_config(bot_id: str, user_identifier: Optional[str]
 
 
 @router.post("/api/v3/llm/chat/completions")
-async def llm_passthrough_v3(request: ChatRequestV3, authorization: Optional[str] = Header(None)):
+async def llm_passthrough_v3(request: LLMPassthroughRequest, authorization: Optional[str] = Header(None)):
     """LLM passthrough API - direct LLM call, bypassing all agent logic.
 
     Only model / api_key / model_server are read from the bot's database config
     (resolved via bot_id). Messages are forwarded to the LLM as-is.
 
+    Supports vision/multimodal input: a message's content can be a plain string
+    or a list of OpenAI-style content parts (text + image_url). Whether images are
+    actually recognized depends on the configured model being vision-capable.
+
     Required Parameters:
         - bot_id: str - target bot id (used to look up LLM config from db)
-        - messages: List[Message] - conversation messages, passed through directly
+        - messages: List[VisionMessage] - conversation messages, passed through directly
 
     Optional Parameters:
         - stream: bool - whether to stream the output, default false
diff --git a/utils/api_models.py b/utils/api_models.py
index a65b2f1..d0f6fd0 100644
--- a/utils/api_models.py
+++ b/utils/api_models.py
@@ -3,7 +3,7 @@
 API data models and response schemas.
 """
 
-from typing import Dict, List, Optional, Any, AsyncGenerator
+from typing import Dict, List, Optional, Any, AsyncGenerator, Union
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 
 class Message(BaseModel):
@@ -89,6 +89,35 @@ class ChatRequestV3(BaseModel):
     user_identifier: Optional[str] = None
 
 
+class VisionMessage(BaseModel):
+    """Chat message whose content may be multimodal (text plus images).
+
+    ``content`` is validated as either:
+    - a plain string (text-only message), or
+    - a list of dicts, intended to be OpenAI-style content parts, e.g.:
+        [
+            {"type": "text", "text": "what is in this image?"},
+            {"type": "image_url", "image_url": {"url": "https://... or data:image/png;base64,..."}}
+        ]
+    """
+    role: str  # any string is accepted; role values are not validated here
+    content: Union[str, List[Dict[str, Any]]]  # parts are untyped dicts; their keys are not checked by this model
+
+
+class LLMPassthroughRequest(BaseModel):
+    """Request body for the LLM passthrough endpoint; messages may be multimodal.
+
+    Only bot_id and messages are required. Config (model/api_key/model_server)
+    is resolved from the database via bot_id; this model itself performs no lookup.
+    """
+    messages: List[VisionMessage]  # each item's content is a string or a list of content-part dicts
+    bot_id: str  # required; identifies the bot whose LLM config is used
+    stream: Optional[bool] = False  # defaults to False; an explicit null is also accepted
+    user_identifier: Optional[str] = None  # optional; omitted means None
+
+    model_config = ConfigDict(extra='allow')  # undeclared request fields are kept rather than rejected
+
+
 class FileProcessRequest(BaseModel):
     unique_id: str
     files: Optional[Dict[str, List[str]]] = Field(default=None, description="Files organized by key groups. Each key maps to a list of file paths (supports zip files)")