diff --git a/prompt/preamble_prompt.md b/prompt/preamble_prompt.md
new file mode 100644
index 0000000..955d6c9
--- /dev/null
+++ b/prompt/preamble_prompt.md
@@ -0,0 +1,41 @@
+You are an AI agent that is expected to generate a preamble message for the customer.
+
+The actual message will be sent later by a smarter agent. Your job is only to generate the right preamble in order to save time.
+
+
+These are the preamble messages you can choose from. You must ONLY choose one of these: ###
+{preamble_choices_text}
+###
+
+The preamble is something very short that continues the interaction naturally, without committing to any later action or response.
+We leave that later response to another agent. Make sure you understand this.
+
+Instructions:
+- Note that some of the choices are more generic, and some are more specific to a particular scenario.
+- If you're unsure what to choose --> prefer to go with a more generic, bland choice. This should be 80% of cases.
+  Examples of generic choices: "Hey there!", "Just a moment.", "Hello.", "Got it."
+- If you see clear value in saying something more specific and nuanced --> then go with a more specific choice. This should be 20% or less of cases.
+  Examples of specific choices: "Let me check that for you.", "Sorry to hear that.", "Thanks for your patience."
+
+
+Chat History:
+{chat_history}
+
+User's Last Message:
+{last_message}
+
+OUTPUT FORMAT:
+You must now choose the preamble message. Produce a JSON object with a single key, "preamble", holding the preamble message as a string,
+EXACTLY as it is given above (pay attention to subtleties like punctuation and copy your choice verbatim). The JSON content must be wrapped in "```json" and "```".
+```json
+{
+  "preamble": "Just a moment."
+}
+```
+
+You will now be given the current state of the interaction, for which you must choose the next preamble message.
+{language}
+
+
+
+
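A note on how this template is consumed (the code appears later in this diff, in `call_preamble_llm`): the placeholders are filled with chained `str.replace` rather than `str.format`, since the template itself contains literal braces in the example JSON block. A minimal sketch, assuming it is run from the repo root so the file added above is readable; the choices, history, and last-message values are made up:

```python
# Illustrative only: fill the template's placeholders the same way call_preamble_llm does.
template = open('./prompt/preamble_prompt.md', encoding='utf-8').read()

choices = "Just a moment.\nGot it.\nLet me check that for you."   # made-up choices list
history = "user: my order is late"                                 # made-up chat history
last_msg = "my order is late"                                      # made-up last message

prompt = (template
          .replace('{preamble_choices_text}', choices)
          .replace('{chat_history}', history)
          .replace('{last_message}', last_msg)
          .replace('{language}', 'Please reply in English'))

# All placeholders are gone; the literal JSON braces in the example block are untouched.
assert '{chat_history}' not in prompt
```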
diff --git a/routes/chat.py b/routes/chat.py
index 919b1f0..479df05 100644
--- a/routes/chat.py
+++ b/routes/chat.py
@@ -16,7 +16,7 @@ from utils.api_models import ChatRequestV2
 from utils.fastapi_utils import (
     process_messages, extract_block_from_system_prompt, format_messages_to_chat_history,
     create_project_directory, extract_api_key_from_auth, generate_v2_auth_token, fetch_bot_config,
-    call_guideline_llm, _get_optimal_batch_size, process_guideline_batch, get_content_from_messages
+    call_guideline_llm, _get_optimal_batch_size, process_guideline_batch, get_content_from_messages, call_preamble_llm, get_preamble_text, get_language_text
 )
 
 router = APIRouter()
@@ -56,7 +56,7 @@ def append_user_last_message(messages: list, content: str) -> bool:
     return messages
 
 
-async def generate_stream_response(agent, messages, thought_list, tool_response: bool, model: str):
+async def generate_stream_response(agent, messages, pre_message_list, tool_response: bool, model: str):
     """生成流式响应"""
     accumulated_content = ""
 
@@ -64,8 +64,8 @@ async def generate_stream_response(agent, messages, thought_list, tool_response:
     chunk_id = 0
 
     try:
-        if len(thought_list)>0:
-            accumulated_content = get_content_from_messages(thought_list, tool_response=tool_response)
+        if len(pre_message_list)>0:
+            accumulated_content = get_content_from_messages(pre_message_list, tool_response=tool_response)
             chunk_data = {
                 "id": f"chatcmpl-thought",
                 "object": "chat.completion.chunk",
@@ -167,6 +167,13 @@ async def create_agent_and_generate_response(
     """创建agent并生成响应的公共逻辑"""
     if generate_cfg is None:
         generate_cfg = {}
+    pre_message_list = []
+    query_text = get_user_last_message_content(messages)
+    chat_history = format_messages_to_chat_history(messages)
+    preamble_text = await call_preamble_llm(chat_history, query_text, get_preamble_text(language), language, model_name, api_key, model_server)
+
+    if preamble_text != '':
+        pre_message_list.append({"role": "assistant","content": preamble_text})
 
     # 1. 从system_prompt提取guideline和terms内容
     system_prompt, guidelines_list, terms_list = extract_block_from_system_prompt(system_prompt)
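Worth noting about the hunk above: `call_preamble_llm` is awaited before the guideline batches and agent creation are set up, so its latency is added to every request rather than hidden behind the concurrent work that already exists further down in this function. Below is a sketch of how the call could be started early and awaited late with `asyncio.create_task`; the `fake_*` coroutines are stand-ins, not functions from this codebase, and this is an illustration rather than part of the patch:

```python
# Illustrative only -- overlapping the preamble call with the rest of the pipeline.
import asyncio

async def fake_call_preamble_llm() -> str:      # stand-in for call_preamble_llm(...)
    await asyncio.sleep(0.2)
    return "Just a moment."

async def fake_process_guidelines() -> str:     # stand-in for the guideline batch tasks
    await asyncio.sleep(0.5)
    return "guideline analysis"

async def build_pre_messages() -> list:
    # Kick off the preamble request without blocking the rest of the pipeline.
    preamble_task = asyncio.create_task(fake_call_preamble_llm())

    guideline_analysis = await fake_process_guidelines()

    pre_message_list = []
    preamble_text = await preamble_task          # usually already finished by now
    if preamble_text:
        pre_message_list.append({"role": "assistant", "content": preamble_text})
    if guideline_analysis:
        pre_message_list.append({"role": "assistant", "reasoning_content": guideline_analysis})
    return pre_message_list

if __name__ == "__main__":
    print(asyncio.run(build_pre_messages()))
```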
@@ -175,8 +182,6 @@ async def create_agent_and_generate_response(
     terms_analysis = ""
     if terms_list:
         logger.info(f"terms_list: {terms_list}")
-        # 从messages中提取用户的查询文本用于相似性检索
-        query_text = get_user_last_message_content(messages)
         # 使用embedding进行terms处理
         try:
             from embedding.embedding import process_terms_with_embedding
@@ -231,9 +236,6 @@
     logger.info(f"Processing {guidelines_count} guidelines in {len(batches)} batches with {batch_count} concurrent batches")
 
-    # 准备chat_history
-    chat_history = format_messages_to_chat_history(messages)
-
     # 并发执行所有任务:guideline批次处理 + agent创建
     tasks = []
 
 
@@ -313,32 +315,22 @@
         user_identifier=user_identifier
     )
 
-    if language:
-        # 在最后一条消息的末尾追加回复语言
-        language_map = {
-            'zh': '请用中文回复',
-            'en': 'Please reply in English',
-            'ja': '日本語で回答してください',
-            'jp': '日本語で回答してください'
-        }
-        language_instruction = language_map.get(language.lower(), '')
-        if language_instruction:
-            messages = append_user_last_message(messages, f"\n\nlanguage:{language_instruction}")
+    messages = append_user_last_message(messages, f"\n\nlanguage:{get_language_text(language)}")
+
-    thought_list = []
     if guideline_analysis != '':
-        thought_list = [{"role": "assistant","reasoning_content": guideline_analysis}]
+        pre_message_list.append({"role": "assistant","reasoning_content": guideline_analysis})
 
     # 根据stream参数决定返回流式还是非流式响应
     if stream:
         return StreamingResponse(
-            generate_stream_response(agent, messages, thought_list, tool_response, model_name),
+            generate_stream_response(agent, messages, pre_message_list, tool_response, model_name),
             media_type="text/event-stream",
             headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
         )
     else:
         # 非流式响应
         agent_responses = agent.run_nonstream(messages)
-        final_responses = thought_list+agent_responses
+        final_responses = pre_message_list+agent_responses
         if final_responses and len(final_responses) > 0:
             # 使用 get_content_from_messages 处理响应,支持 tool_response 参数
             content = get_content_from_messages(final_responses, tool_response=tool_response)
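To make the last hunk concrete: in the non-stream path the preamble message and the guideline reasoning are simply prepended to whatever the agent returns, and the combined list is rendered once. The `join_content` helper below is a simplified stand-in for `get_content_from_messages`, whose real implementation is not part of this diff:

```python
# Illustrative only: ordering of messages in the non-stream response after this change.

def join_content(messages: list, tool_response: bool = False) -> str:
    # Stand-in: concatenate assistant content fields; reasoning-only entries contribute nothing.
    return "".join(m.get("content", "") for m in messages if m.get("role") == "assistant")

pre_message_list = [
    {"role": "assistant", "content": "Just a moment."},                    # preamble
    {"role": "assistant", "reasoning_content": "guideline analysis ..."},  # thought
]
agent_responses = [{"role": "assistant", "content": "Here is the actual answer."}]

final_responses = pre_message_list + agent_responses
print(join_content(final_responses))   # -> "Just a moment.Here is the actual answer."
```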
please.", + "I'll help you with this.", + "Let me figure that out.", + "Thanks for waiting.", + "I'll check on that." + ], + 'ja': [ + "少々お待ちください。", + "承知いたしました。", + "わかりました。", + "確認いたします。", + "少々お時間をください。", + "了解しました。", + "調べてみますね。", + "お待たせしました。", + "対応いたします。", + "わかりましたね。", + "承知しました。", + "確認させてください。", + "少々お待ちいただけますか。", + "お調べいたします。", + "対応いたしますね。" + ] + }; + return "\n".join(preamble_choices_map.get(language.lower(), [])) + + +async def call_preamble_llm(chat_history: str, last_message: str, preamble_choices_text: str, language: str, model_name: str, api_key: str, model_server: str) -> str: + """调用大语言模型处理guideline分析 + + Args: + chat_history: 聊天历史记录 + guidelines_text: 指导原则文本 + model_name: 模型名称 + api_key: API密钥 + model_server: 模型服务器地址 + + Returns: + str: 模型响应结果 + """ + # 读取guideline提示词模板 + try: + with open('./prompt/preamble_prompt.md', 'r', encoding='utf-8') as f: + preamble_template = f.read() + except Exception as e: + logger.error(f"Error reading guideline prompt template: {e}") + return "" + + # 替换模板中的占位符 + system_prompt = preamble_template.replace('{preamble_choices_text}', preamble_choices_text).replace('{chat_history}', chat_history).replace('{last_message}', last_message).replace('{language}', get_language_text(language)) + # 配置LLM + llm_config = { + 'model': model_name, + 'api_key': api_key, + 'model_server': model_server, # 使用传入的model_server参数 + } + + # 调用模型 + messages = [{'role': 'user', 'content': system_prompt}] + + try: + # 使用信号量控制并发API调用数量 + async with api_semaphore: + # 使用线程池执行同步HTTP调用,避免阻塞事件循环 + loop = asyncio.get_event_loop() + response = await loop.run_in_executor(thread_pool, _sync_call_llm, llm_config, messages) + + # 从响应中提取 ```json 和 ``` 包裹的内容 + json_pattern = r'```json\s*\n(.*?)\n```' + json_matches = re.findall(json_pattern, response, re.DOTALL) + + if json_matches: + try: + # 解析第一个找到的JSON对象 + json_data = json.loads(json_matches[0]) + logger.info(f"Successfully processed preamble") + return json_data["preamble"] # 返回解析后的preamble + except json.JSONDecodeError as e: + logger.error(f"Error parsing JSON from preamble analysis: {e}") + return "" + else: + logger.warning(f"No JSON format found in preamble analysis") + return "" + + except Exception as e: + logger.error(f"Error calling guideline LLM: {e}") + return "" + + async def call_guideline_llm(chat_history: str, guidelines_text: str, terms:str, model_name: str, api_key: str, model_server: str) -> str: """调用大语言模型处理guideline分析 @@ -437,7 +563,7 @@ async def call_guideline_llm(chat_history: str, guidelines_text: str, terms:str, async with api_semaphore: # 使用线程池执行同步HTTP调用,避免阻塞事件循环 loop = asyncio.get_event_loop() - response = await loop.run_in_executor(thread_pool, _sync_call_guideline_llm, llm_config, messages) + response = await loop.run_in_executor(thread_pool, _sync_call_llm, llm_config, messages) return response except Exception as e: