"""Utility helpers for versioned uploads, qwen-agent message (de)serialization,
and guideline/terms analysis via an LLM backend."""
import os
import re
import hashlib
import json
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Optional, Union, Any

import aiohttp
from fastapi import HTTPException
from qwen_agent.llm.schema import ASSISTANT, FUNCTION
from qwen_agent.llm.oai import TextChatAtOAI

from utils.logger import logger

# Global thread pool used to run synchronous HTTP/LLM calls off the event loop.
thread_pool = ThreadPoolExecutor(max_workers=10)

# Cap on concurrently in-flight LLM API calls.
# NOTE(review): created at import time without a running loop; safe on
# Python >= 3.10 where asyncio primitives bind to the loop lazily — confirm
# the deployment interpreter version.
api_semaphore = asyncio.Semaphore(8)  # at most 8 concurrent API calls


def get_versioned_filename(upload_dir: str, name_without_ext: str,
                           file_extension: str) -> tuple[str, int]:
    """Compute a versioned filename, deleting stale copies and bumping the version.

    Args:
        upload_dir: Directory containing the uploaded files.
        name_without_ext: Base filename without extension.
        file_extension: Extension including the leading dot.

    Returns:
        tuple[str, int]: (final filename, version number).
    """
    original_name = name_without_ext + file_extension
    original_exists = os.path.exists(os.path.join(upload_dir, original_name))

    # Match versioned siblings of the form "<name>_<n><ext>".
    version_pattern = re.compile(
        re.escape(name_without_ext) + r'_(\d+)' + re.escape(file_extension) + r'$')

    existing_versions: List[int] = []
    files_to_delete: List[str] = []
    for filename in os.listdir(upload_dir):
        if filename == original_name:
            files_to_delete.append(filename)
            continue
        match = version_pattern.match(filename)
        if match:
            existing_versions.append(int(match.group(1)))
            files_to_delete.append(filename)

    # Nothing related exists yet: keep the plain name and call it version 1.
    if not original_exists and not existing_versions:
        return original_name, 1

    # Remove the original file and every versioned copy before re-uploading.
    for filename in files_to_delete:
        file_to_delete = os.path.join(upload_dir, filename)
        try:
            os.remove(file_to_delete)
            logger.info(f"已删除文件: {file_to_delete}")
        except OSError as e:
            logger.error(f"删除文件失败 {file_to_delete}: {e}")

    # Next version is max(existing) + 1; if only the original existed, start at 2.
    next_version = max(existing_versions) + 1 if existing_versions else 2
    return f"{name_without_ext}_{next_version}{file_extension}", next_version


def get_content_from_messages(messages: List[dict],
                              tool_response: bool = True) -> str:
    """Extract content from qwen-agent messages with special formatting.

    Serializes a message list into a single text where assistant reasoning,
    tool calls, tool results and answers are delimited by [THINK],
    [TOOL_CALL], [TOOL_RESPONSE] and [ANSWER] markers.

    NOTE(review): the middle and tail of this function were corrupted in the
    source I reviewed (the regex stripping partial streaming tool-call text,
    plus the function_call / FUNCTION-role handling and the final join, were
    lost). The code below is reconstructed from the visible prefix and from
    process_messages(), which is documented as this function's inverse —
    confirm against version control before relying on exact output format.
    """
    full_text = ''
    content = []
    TOOL_CALL_S = '[TOOL_CALL]'
    TOOL_RESULT_S = '[TOOL_RESPONSE]'
    THOUGHT_S = '[THINK]'
    ANSWER_S = '[ANSWER]'
    for msg in messages:
        if msg['role'] == ASSISTANT:
            if msg.get('reasoning_content'):
                assert isinstance(msg['reasoning_content'], str), 'Now only supports text messages'
                content.append(f'{THOUGHT_S}\n{msg["reasoning_content"]}')
            if msg.get('content'):
                assert isinstance(msg['content'], str), 'Now only supports text messages'
                content_text = msg["content"]
                # Strip incomplete tool_call fragments produced mid-stream.
                # NOTE(review): reconstructed pattern — TODO confirm the
                # original regex.
                content_text = re.sub(r'<tool_call>.*?(?:</tool_call>|$)', '',
                                      content_text, flags=re.DOTALL)
                if content_text.strip():
                    content.append(f'{ANSWER_S}\n{content_text}')
            if msg.get('function_call'):
                # "[TOOL_CALL] name\narguments" — format mirrored by
                # process_messages() when parsing.
                fc = msg['function_call']
                content.append(f'{TOOL_CALL_S} {fc["name"]}\n{fc["arguments"]}')
        elif msg['role'] == FUNCTION and tool_response:
            # "[TOOL_RESPONSE] name\ncontent" — format mirrored by
            # process_messages() when parsing.
            content.append(f'{TOOL_RESULT_S} {msg.get("name", "")}\n{msg.get("content", "")}')
    full_text = '\n'.join(content)
    return full_text


def process_messages(messages) -> List[Dict[str, str]]:
    """Process a message list: split on [TOOL_CALL]/[TOOL_RESPONSE]/[ANSWER]
    markers, compress old tool traffic, and re-materialize function messages.

    This is the inverse of get_content_from_messages: messages containing
    [TOOL_RESPONSE] are re-assembled back into msg['role'] == 'function'
    entries plus msg.get('function_call') on the assistant message.

    NOTE(review): the original `def` line of this function was lost in the
    corrupted source; the name `process_messages` is reconstructed from the
    docstring — verify against callers. `messages` items are objects exposing
    `.role` and `.content` attributes (request models), not plain dicts.
    """
    processed_messages = []

    # Positions of assistant messages; only the last 5 keep their tool
    # call/response payloads (older ones are stripped down to answers).
    assistant_indices = [i for i, msg in enumerate(messages) if msg.role == "assistant"]
    cutoff_point = max(0, len(assistant_indices) - 5)

    # Pass 1: per-message compression of tool traffic.
    for msg_idx, msg in enumerate(messages):
        if msg.role != "assistant":
            processed_messages.append({"role": msg.role, "content": msg.content})
            continue

        assistant_position = assistant_indices.index(msg_idx)
        is_recent_message = assistant_position >= cutoff_point

        # Capturing split: even indices are text, odd indices are tag names.
        parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg.content)
        filtered_content = ""
        current_tag = None
        for part_idx, part in enumerate(parts):
            if part_idx % 2 == 1:  # tag name
                current_tag = part
                continue
            text = part.strip()
            if not text:
                continue
            if current_tag == "TOOL_RESPONSE":
                if is_recent_message:
                    # Recent assistant messages keep tool responses, long ones
                    # abridged to head / middle / tail of 250 chars each.
                    if len(text) <= 500:
                        filtered_content += f"[TOOL_RESPONSE]\n{text}\n"
                    else:
                        first_part = text[:250]
                        middle_start = len(text) // 2 - 125
                        middle_part = text[middle_start:middle_start + 250]
                        last_part = text[-250:]
                        # NOTE: omitted_count is the TOTAL number of dropped
                        # characters, but it is printed in both gaps.
                        omitted_count = len(text) - 750
                        omitted_text = f"...此处省略{omitted_count}字..."
                        truncated_text = (f"{first_part}\n{omitted_text}\n"
                                          f"{middle_part}\n{omitted_text}\n{last_part}")
                        filtered_content += f"[TOOL_RESPONSE]\n{truncated_text}\n"
                # Older messages: drop tool responses entirely.
            elif current_tag == "TOOL_CALL":
                if is_recent_message:
                    filtered_content += f"[TOOL_CALL]\n{text}\n"
                # Older messages: drop tool calls entirely.
            elif current_tag == "ANSWER":
                # Answers are always kept.
                filtered_content += f"[ANSWER]\n{text}\n"
            else:
                # Text preceding the first marker.
                filtered_content += text + "\n"

        final_content = filtered_content.strip()
        if final_content:
            processed_messages.append({"role": msg.role, "content": final_content})
        else:
            # Everything was stripped: fall back to the original content.
            processed_messages.append({"role": msg.role, "content": msg.content})

    # Pass 2 (inverse of get_content_from_messages): explode assistant
    # messages containing [TOOL_RESPONSE] back into function messages.
    final_messages = []
    for msg in processed_messages:
        if not (msg["role"] == ASSISTANT and "[TOOL_RESPONSE]" in msg["content"]):
            final_messages.append(msg)
            continue

        parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg["content"])
        current_tag = None
        assistant_content = ""
        function_calls = []
        tool_responses = []
        for part_idx, part in enumerate(parts):
            if part_idx % 2 == 1:  # tag name
                current_tag = part
                continue
            text = part.strip()
            if not text:
                continue
            if current_tag == "TOOL_RESPONSE":
                # Format: "[TOOL_RESPONSE] function_name\ncontent"
                pieces = text.split('\n', 1)
                tool_responses.append({
                    "role": FUNCTION,
                    "name": pieces[0].strip() if pieces else "",
                    "content": pieces[1].strip() if len(pieces) > 1 else "",
                })
            elif current_tag == "TOOL_CALL":
                # Format: "[TOOL_CALL] function_name\narguments"
                pieces = text.split('\n', 1)
                function_calls.append({
                    "name": pieces[0].strip() if pieces else "",
                    "arguments": pieces[1].strip() if len(pieces) > 1 else "",
                })
            else:
                # ANSWER text and text before the first marker both belong
                # to the assistant turn.
                assistant_content += text + "\n"

        if assistant_content.strip() or function_calls:
            assistant_msg = {"role": ASSISTANT}
            if assistant_content.strip():
                assistant_msg["content"] = assistant_content.strip()
            if function_calls:
                # Multiple function_calls: keep only the first (legacy behavior).
                assistant_msg["function_call"] = function_calls[0]
            final_messages.append(assistant_msg)
        # All tool responses become standalone FUNCTION-role messages.
        final_messages.extend(tool_responses)

    return final_messages


def format_messages_to_chat_history(messages: List[Dict[str, str]]) -> str:
    """Format messages as a plain-text chat transcript.

    Args:
        messages: Message list of {"role": ..., "content": ...} dicts.

    Returns:
        str: "role: content" lines joined by newlines; only user and
        assistant turns are kept (function/system roles are skipped).
    """
    chat_history = []
    for message in messages:
        role = message.get('role', '')
        content = message.get('content', '')
        if role in ('user', 'assistant'):
            chat_history.append(f"{role}: {content}")
    return "\n".join(chat_history)


def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str,
                             robot_type: str = "general_agent") -> Optional[str]:
    """Create the robot project directory when applicable.

    Only creates a directory for robot_type == "catalog_agent" with a
    non-empty dataset_ids list; returns None otherwise or on failure.
    """
    if robot_type != "catalog_agent" or not dataset_ids:
        return None
    try:
        # Imported lazily to avoid a module-level dependency cycle.
        from utils.multi_project_manager import create_robot_project
        return create_robot_project(dataset_ids, bot_id)
    except Exception as e:
        # Best-effort: directory creation failure must not break the caller.
        logger.error(f"Error creating project directory: {e}")
        return None


def extract_api_key_from_auth(authorization: Optional[str]) -> Optional[str]:
    """Extract the API key from an Authorization header value.

    Strips a leading "Bearer " prefix if present; returns None when the
    header is missing/empty.
    """
    if not authorization:
        return None
    if authorization.startswith("Bearer "):
        return authorization[7:]
    return authorization


def generate_v2_auth_token(bot_id: str) -> str:
    """Generate the v2 API auth token.

    Token is md5("<MASTERKEY>:<bot_id>") in hex. NOTE(review): MD5 is not a
    secure MAC; presumably this matches what the backend expects — do not
    change unilaterally.
    """
    masterkey = os.getenv("MASTERKEY", "master")
    token_input = f"{masterkey}:{bot_id}"
    return hashlib.md5(token_input.encode()).hexdigest()


async def fetch_bot_config(bot_id: str) -> Dict[str, Any]:
    """Fetch the bot configuration from the backend API.

    Raises:
        HTTPException: 400 for API-level failures, 500 for transport errors.
    """
    try:
        backend_host = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
        url = f"{backend_host}/v1/agent_bot_config/{bot_id}"
        auth_token = generate_v2_auth_token(bot_id)
        headers = {
            "content-type": "application/json",
            "authorization": f"Bearer {auth_token}"
        }

        # Async HTTP request; ClientTimeout replaces the deprecated bare-int
        # timeout argument.
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers,
                                   timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status != 200:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: API returned status code {response.status}"
                    )
                response_data = await response.json()
                if not response_data.get("success"):
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: {response_data.get('message', 'Unknown error')}"
                    )
                return response_data.get("data", {})
    except aiohttp.ClientError as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect to backend API: {str(e)}"
        )
    except Exception as e:
        # Re-raise our own HTTPExceptions untouched; wrap anything else.
        if isinstance(e, HTTPException):
            raise
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch bot config: {str(e)}"
        )


def _sync_call_guideline_llm(llm_config, messages) -> str:
    """Synchronous LLM call helper, executed inside the thread pool.

    Returns the response text, or "" on any error (best-effort by design).
    """
    llm_instance = TextChatAtOAI(llm_config)
    try:
        # stream=False to get a single non-streaming response.
        response = llm_instance.chat(messages=messages, stream=False)
        # Message-list response: pull the first message's content.
        if isinstance(response, list) and response:
            if hasattr(response[0], 'content'):
                return response[0].content
            elif isinstance(response[0], dict) and 'content' in response[0]:
                return response[0]['content']
        # Plain-string response.
        if isinstance(response, str):
            return response
        # Anything else: stringify defensively.
        return str(response) if response else ""
    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""


async def call_guideline_llm(chat_history: str, guidelines_text: str, terms: str,
                             model_name: str, api_key: str, model_server: str) -> str:
    """Call the LLM to run guideline analysis.

    Args:
        chat_history: Chat transcript text.
        guidelines_text: Guidelines text block.
        terms: Terms text block.
        model_name: Model name.
        api_key: API key.
        model_server: Model server base URL.

    Returns:
        str: Model response, or "" on any error.
    """
    # Load the guideline prompt template.
    try:
        with open('./prompt/guideline_prompt.md', 'r', encoding='utf-8') as f:
            guideline_template = f.read()
    except Exception as e:
        logger.error(f"Error reading guideline prompt template: {e}")
        return ""

    # Fill in template placeholders (plain replace, not str.format, because
    # the template may contain other braces).
    system_prompt = (guideline_template
                     .replace('{chat_history}', chat_history)
                     .replace('{guidelines_text}', guidelines_text)
                     .replace('{terms}', terms))

    llm_config = {
        'model': model_name,
        'api_key': api_key,
        'model_server': model_server,
    }
    messages = [{'role': 'user', 'content': system_prompt}]
    try:
        # Bound concurrent API calls, and run the blocking HTTP call in the
        # thread pool so the event loop is not blocked.
        async with api_semaphore:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                thread_pool, _sync_call_guideline_llm, llm_config, messages)
    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""


def _get_optimal_batch_size(guidelines_count: int) -> int:
    """Pick the batch (concurrency) count from the number of guidelines."""
    if guidelines_count <= 10:
        return 1
    if guidelines_count <= 20:
        return 2
    if guidelines_count <= 30:
        return 3
    return 5


async def process_guideline_batch(
    guidelines_batch: List[str],
    chat_history: str,
    terms: str,
    model_name: str,
    api_key: str,
    model_server: str
) -> Union[str, Dict[str, Any], List[Any]]:
    """Process one batch of guidelines through the LLM.

    Returns the parsed JSON object from a ```json fenced block when present
    and valid, otherwise the raw response text, or "" on error.
    """
    try:
        batch_guidelines_text = "\n".join(guidelines_batch)
        batch_analysis = await call_guideline_llm(
            chat_history, batch_guidelines_text, terms,
            model_name, api_key, model_server)

        # Extract content wrapped in ```json ... ``` fences.
        json_pattern = r'```json\s*\n(.*?)\n```'
        json_matches = re.findall(json_pattern, batch_analysis, re.DOTALL)
        if json_matches:
            try:
                return json.loads(json_matches[0])
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing JSON from guideline analysis: {e}")
                return batch_analysis  # JSON parse failed: raw text
        return batch_analysis  # no JSON fence found: raw text
    except Exception as e:
        logger.error(f"Error processing guideline batch: {e}")
        return ""


def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Extract guideline and terms fenced blocks from a system prompt.

    Args:
        system_prompt: The system prompt text (may be None/empty).

    Returns:
        tuple: (cleaned system_prompt with parsed blocks removed,
        guidelines list, terms list).
    """
    if not system_prompt:
        return "", [], []

    guidelines_list: List[Dict[str, Any]] = []
    terms_list: List[Dict[str, Any]] = []

    # Find every fenced block: ```<type>\n<content>\n```
    block_pattern = r'```(\w+)\s*\n(.*?)\n```'
    blocks_to_remove = []
    for match in re.finditer(block_pattern, system_prompt, re.DOTALL):
        block_type, content = match.groups()
        if block_type == 'guideline':
            try:
                guidelines_list.extend(parse_guidelines_text(content.strip()))
                blocks_to_remove.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing guidelines: {e}")
        elif block_type == 'terms':
            try:
                terms_list.extend(parse_terms_text(content.strip()))
                blocks_to_remove.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing terms: {e}")

    # Strip the successfully-parsed blocks from the prompt.
    cleaned_prompt = system_prompt
    for block in blocks_to_remove:
        cleaned_prompt = cleaned_prompt.replace(block, '', 1)

    # Collapse runs of blank lines left behind by the removal.
    cleaned_prompt = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_prompt).strip()

    return cleaned_prompt, guidelines_list, terms_list


def parse_guidelines_text(text: str) -> List[Dict[str, Any]]:
    """Parse guidelines text in any of several supported formats.

    Supports JSON (list/object), "id) Condition: ... Action: ..." lines,
    "condition -> action" lines, "if ... then ..." lines, and bare action
    lines. Lines starting with '#' or '//' are ignored.

    Returns:
        List[Dict]: guideline dicts with id/condition/action/priority keys.
    """
    guidelines: List[Dict[str, Any]] = []

    # JSON payloads short-circuit the line parsing.
    if text.strip().startswith('[') or text.strip().startswith('{'):
        try:
            data = json.loads(text)
            if isinstance(data, list):
                guidelines.extend(item for item in data if isinstance(item, dict))
            elif isinstance(data, dict):
                guidelines.append(data)
            return guidelines
        except json.JSONDecodeError:
            pass  # fall through to line-based parsing

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    for line in lines:
        # Skip comment lines.
        if line.startswith('#') or line.startswith('//'):
            continue

        # "id) Condition: ... Action: ... [Priority: n]" format.
        id_condition_action_pattern = r'(\d+)\)\s*Condition:\s*(.*?)\s*Action:\s*(.*?)(?:\s*Priority:\s*(\d+))?$'
        match = re.match(id_condition_action_pattern, line, re.IGNORECASE)
        if match:
            guidelines.append({
                'id': int(match.group(1)),
                'condition': match.group(2).strip(),
                'action': match.group(3).strip(),
                'priority': int(match.group(4)) if match.group(4) else 1
            })
            continue

        # "condition -> action [n]" format.
        arrow_pattern = r'(?:\d+\)\s*)?(.*?)\s*->\s*(.*?)(?:\s*\[(\d+)\])?$'
        match = re.match(arrow_pattern, line, re.IGNORECASE)
        if match:
            guidelines.append({
                'id': len(guidelines) + 1,
                'condition': match.group(1).strip(),
                'action': match.group(2).strip(),
                'priority': int(match.group(3)) if match.group(3) else 1
            })
            continue

        # "if condition then action [n]" format.
        if_then_pattern = r'(?:\d+\)\s*)?if\s+(.*?)\s+then\s+(.*?)(?:\s*\[(\d+)\])?$'
        match = re.match(if_then_pattern, line, re.IGNORECASE)
        if match:
            guidelines.append({
                'id': len(guidelines) + 1,
                'condition': match.group(1).strip(),
                'action': match.group(2).strip(),
                'priority': int(match.group(3)) if match.group(3) else 1
            })
            continue

        # Fallback: the whole line is the action, with no condition.
        guidelines.append({
            'id': len(guidelines) + 1,
            'condition': '',
            'action': line.strip(),
            'priority': 1
        })

    return guidelines


def parse_terms_text(text: str) -> List[Dict[str, Any]]:
    """Parse terms text in any of several supported formats.

    Supports JSON (list/object), "Name: ..., Description: ..., Synonyms: ..."
    lines, and "| name | description | synonyms" lines. Lines starting with
    '#' or '//' are ignored.

    Returns:
        List[Dict]: term dicts with name and optional description/synonyms.
    """
    terms: List[Dict[str, Any]] = []

    # JSON payloads short-circuit the line parsing.
    if text.strip().startswith('[') or text.strip().startswith('{'):
        try:
            data = json.loads(text)
            if isinstance(data, list):
                terms.extend(item for item in data if isinstance(item, dict))
            elif isinstance(data, dict):
                terms.append(data)
            return terms
        except json.JSONDecodeError:
            pass  # fall through to line-based parsing

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    current_term: Dict[str, Any] = {}
    for line in lines:
        # Skip comment lines.
        if line.startswith('#') or line.startswith('//'):
            continue

        # "1) Name: term_name, Description: desc, Synonyms: s1, s2" format.
        numbered_term_pattern = r'(?:\d+\)\s*)?Name:\s*([^,]+)(?:,\s*Description:\s*([^,]+))?(?:,\s*Synonyms:\s*(.+))?'
        match = re.match(numbered_term_pattern, line, re.IGNORECASE)
        if match:
            name = match.group(1).strip()
            description = match.group(2).strip() if match.group(2) else ''
            synonyms_text = match.group(3).strip() if match.group(3) else ''

            term_data: Dict[str, Any] = {'name': name}
            if description:
                term_data['description'] = description
            if synonyms_text:
                synonyms = re.split(r'[,;|]', synonyms_text)
                term_data['synonyms'] = [s.strip() for s in synonyms if s.strip()]

            if current_term:
                # A new term begins: flush the previous one.
                terms.append(current_term)
            current_term = term_data
            continue

        # "| name | description | synonyms" simplified format.
        if line.startswith('|'):
            parts = [p.strip() for p in line[1:].split('|', 2)]  # at most 3 fields
            if len(parts) >= 1:
                if current_term:
                    terms.append(current_term)
                current_term = {'name': parts[0]}
                if len(parts) >= 2:
                    current_term['description'] = parts[1]
                if len(parts) >= 3:
                    synonyms = re.split(r'[,;|]', parts[2])
                    current_term['synonyms'] = [s.strip() for s in synonyms if s.strip()]
            continue

    # Flush the trailing term.
    if current_term:
        terms.append(current_term)

    return terms