import os
|
||
import re
|
||
import hashlib
|
||
import json
|
||
import asyncio
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from typing import List, Dict, Optional, Union, Any
|
||
import aiohttp
|
||
from qwen_agent.llm.schema import ASSISTANT, FUNCTION
|
||
from qwen_agent.llm.oai import TextChatAtOAI
|
||
from fastapi import HTTPException
|
||
import logging
|
||
|
||
logger = logging.getLogger('app')
|
||
|
||
# Global thread pool executor used to run synchronous HTTP calls off the event loop.
thread_pool = ThreadPoolExecutor(max_workers=10)

# Concurrency gate: limits the number of simultaneous outbound API calls.
api_semaphore = asyncio.Semaphore(8)  # at most 8 concurrent API calls
|
||
|
||
|
||
def get_versioned_filename(upload_dir: str, name_without_ext: str, file_extension: str) -> tuple[str, int]:
    """Return a versioned filename for an upload, removing previous copies.

    Scans *upload_dir* for the base file (``name_without_ext + file_extension``)
    and any versioned siblings (``name_2.ext``, ``name_3.ext``, ...).  All of
    them are deleted, and the filename for the next version is returned.

    Args:
        upload_dir: Directory holding the uploaded files.
        name_without_ext: File name without its extension.
        file_extension: Extension including the leading dot.

    Returns:
        tuple[str, int]: (final file name, version number).
    """
    base_name = name_without_ext + file_extension
    has_original = os.path.exists(os.path.join(upload_dir, base_name))

    # Matches "<name>_<n><ext>" exactly (anchored at the end).
    version_re = re.compile(
        re.escape(name_without_ext) + r'_(\d+)' + re.escape(file_extension) + r'$'
    )

    versions = []
    stale_files = []

    # Collect the original file and every versioned sibling for deletion.
    for entry in os.listdir(upload_dir):
        if entry == base_name:
            stale_files.append(entry)
            continue
        m = version_re.match(entry)
        if m:
            versions.append(int(m.group(1)))
            stale_files.append(entry)

    # Nothing on disk yet: the plain name is version 1.
    if not has_original and not versions:
        return base_name, 1

    # Remove every stale copy (original and versioned alike).
    for entry in stale_files:
        file_to_delete = os.path.join(upload_dir, entry)
        try:
            os.remove(file_to_delete)
            logger.info(f"已删除文件: {file_to_delete}")
        except OSError as e:
            logger.error(f"删除文件失败 {file_to_delete}: {e}")

    # Next version: one past the highest seen, or 2 when only the original existed.
    next_version = max(versions) + 1 if versions else 2

    return f"{name_without_ext}_{next_version}{file_extension}", next_version
|
||
|
||
|
||
def get_content_from_messages(messages: List[dict], tool_response: bool = True) -> str:
    """Flatten qwen-agent messages into one tagged text transcript.

    Assistant reasoning, answers, and tool calls become ``[THINK]`` /
    ``[ANSWER]`` / ``[TOOL_CALL]`` sections; function results become
    ``[TOOL_RESPONSE]`` sections (dropped when *tool_response* is False).
    Any other role raises ``TypeError``.
    """
    full_text = ''
    content = []
    # Section tag literals used in the flattened transcript.
    TOOL_CALL_S = '[TOOL_CALL]'
    TOOL_RESULT_S = '[TOOL_RESPONSE]'
    THOUGHT_S = '[THINK]'
    ANSWER_S = '[ANSWER]'

    for msg in messages:

        if msg['role'] == ASSISTANT:
            if msg.get('reasoning_content'):
                assert isinstance(msg['reasoning_content'], str), 'Now only supports text messages'
                content.append(f'{THOUGHT_S}\n{msg["reasoning_content"]}')
            if msg.get('content'):
                assert isinstance(msg['content'], str), 'Now only supports text messages'
                # Filter out incomplete tool_call fragments left by streaming output.
                content_text = msg["content"]

                # A truncated opening "<tool_call" tag can terminate streamed text;

                # strip any prefix of "<tool_call" that ends the string.
                content_text = re.sub(r'<t?o?o?l?_?c?a?l?l?$', '', content_text)
                # Only keep the section if something remains after cleaning.
                if content_text.strip():
                    content.append(f'{ANSWER_S}\n{content_text}')
            if msg.get('function_call'):
                content_text = msg["function_call"]["arguments"]
                # Strip a truncated trailing "}\n</tool_call" fragment from the arguments.
                content_text = re.sub(r'}\n<\/?t?o?o?l?_?c?a?l?l?$', '', content_text)
                if content_text.strip():
                    content.append(f'{TOOL_CALL_S} {msg["function_call"]["name"]}\n{content_text}')
        elif msg['role'] == FUNCTION:
            if tool_response:
                content.append(f'{TOOL_RESULT_S} {msg["name"]}\n{msg["content"]}')
        else:
            # Unsupported role (e.g. user/system) in this flattening step.
            raise TypeError

    if content:
        full_text = '\n'.join(content)

    return full_text
|
||
|
||
|
||
def process_messages(messages: List[Dict], language: Optional[str] = None) -> List[Dict[str, str]]:
    """Filter tagged assistant history and re-expand it into structured messages.

    Pass 1 walks the history and, inside each assistant message, keeps
    [ANSWER] sections always, but keeps [TOOL_CALL]/[TOOL_RESPONSE] sections
    only for the most recent assistant turns (long tool responses are
    abridged to head/middle/tail excerpts).  Pass 2 is the inverse of
    ``get_content_from_messages``: surviving [TOOL_CALL]/[TOOL_RESPONSE]
    sections are re-expanded into ``function_call`` fields and
    role=function messages.

    NOTE(review): inline comments below say "last 10" but the code keeps the
    last 5 assistant turns (``total - 5``) — confirm which is intended.
    NOTE(review): ``language`` is accepted but never used in this function.
    NOTE(review): input items are accessed as objects (``msg.role``), not
    dicts — presumably pydantic request models; verify against callers.
    """
    processed_messages = []

    # Positions of assistant turns; only the most recent ones keep tool traffic.
    assistant_indices = [i for i, msg in enumerate(messages) if msg.role == "assistant"]
    total_assistant_messages = len(assistant_indices)
    cutoff_point = max(0, total_assistant_messages - 5)

    # ---- Pass 1: filter the tagged sections of each message ----
    for i, msg in enumerate(messages):
        if msg.role == "assistant":
            # Rank of this assistant turn among all assistant turns (0-based).
            assistant_position = assistant_indices.index(i)

            # re.split with a capturing group yields [text, tag, text, tag, ...].
            parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg.content)

            # Rebuild the content, deciding per-tag what to keep.
            filtered_content = ""
            current_tag = None
            is_recent_message = assistant_position >= cutoff_point  # among the last turns

            # NOTE(review): this inner loop reuses ``i``, clobbering the outer
            # index; harmless today (outer ``i`` is not read afterwards) but fragile.
            for i in range(0, len(parts)):
                if i % 2 == 0:  # text between tags
                    text = parts[i].strip()
                    if not text:
                        continue

                    if current_tag == "TOOL_RESPONSE":
                        if is_recent_message:
                            # Recent turn: keep the tool response (abridged when long).
                            if len(text) <= 500:
                                filtered_content += f"[TOOL_RESPONSE]\n{text}\n"
                            else:
                                # Keep head / middle / tail excerpts of 250 chars each.
                                first_part = text[:250]
                                middle_start = len(text) // 2 - 125
                                middle_part = text[middle_start:middle_start + 250]
                                last_part = text[-250:]

                                # Total characters dropped (same figure shown at both gaps).
                                omitted_count = len(text) - 750
                                omitted_text = f"...此处省略{omitted_count}字..."

                                # Stitch the excerpts together.
                                truncated_text = f"{first_part}\n{omitted_text}\n{middle_part}\n{omitted_text}\n{last_part}"
                                filtered_content += f"[TOOL_RESPONSE]\n{truncated_text}\n"
                        # Older turns: tool responses are dropped entirely.
                    elif current_tag == "TOOL_CALL":
                        if is_recent_message:
                            # Recent turn: keep the tool call.
                            filtered_content += f"[TOOL_CALL]\n{text}\n"
                        # Older turns: tool calls are dropped entirely.
                    elif current_tag == "ANSWER":
                        # Answers are kept for every assistant turn.
                        filtered_content += f"[ANSWER]\n{text}\n"
                    else:
                        # Text appearing before the first tag.
                        filtered_content += text + "\n"
                else:  # tag element
                    current_tag = parts[i]

            # Final filtered content, trimmed of surrounding whitespace.
            final_content = filtered_content.strip()
            if final_content:
                processed_messages.append({"role": msg.role, "content": final_content})
            else:
                # Filtering removed everything: fall back to the original content.
                processed_messages.append({"role": msg.role, "content": msg.content})
        else:
            processed_messages.append({"role": msg.role, "content": msg.content})

    # ---- Pass 2: inverse of get_content_from_messages ----
    # Re-expand messages containing [TOOL_RESPONSE] back into role=function
    # messages and assistant ``function_call`` fields.
    final_messages = []
    for msg in processed_messages:
        if msg["role"] == ASSISTANT and "[TOOL_RESPONSE]" in msg["content"]:
            # Split the content on the section tags again.
            parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg["content"])

            current_tag = None
            assistant_content = ""
            function_calls = []
            tool_responses = []

            for i in range(0, len(parts)):
                if i % 2 == 0:  # text between tags
                    text = parts[i].strip()
                    if not text:
                        continue

                    if current_tag == "TOOL_RESPONSE":
                        # Section format: "[TOOL_RESPONSE] function_name\ncontent"
                        lines = text.split('\n', 1)
                        function_name = lines[0].strip() if lines else ""
                        response_content = lines[1].strip() if len(lines) > 1 else ""

                        tool_responses.append({
                            "role": FUNCTION,
                            "name": function_name,
                            "content": response_content
                        })
                    elif current_tag == "TOOL_CALL":
                        # Section format: "[TOOL_CALL] function_name\narguments"
                        lines = text.split('\n', 1)
                        function_name = lines[0].strip() if lines else ""
                        arguments = lines[1].strip() if len(lines) > 1 else ""

                        function_calls.append({
                            "name": function_name,
                            "arguments": arguments
                        })
                    elif current_tag == "ANSWER":
                        assistant_content += text + "\n"
                    else:
                        # Text before the first tag also belongs to the assistant.
                        assistant_content += text + "\n"
                else:  # tag element
                    current_tag = parts[i]

            # Emit the assistant message when it has content or tool calls.
            if assistant_content.strip() or function_calls:
                assistant_msg = {"role": ASSISTANT}
                if assistant_content.strip():
                    assistant_msg["content"] = assistant_content.strip()
                if function_calls:
                    # Multiple calls collapse to the first one (legacy behaviour).
                    assistant_msg["function_call"] = function_calls[0]
                final_messages.append(assistant_msg)

            # Emit every recovered tool response as a role=function message.
            final_messages.extend(tool_responses)
        else:
            # Non-assistant messages, or ones without [TOOL_RESPONSE], pass through.
            final_messages.append(msg)

    return final_messages
|
||
|
||
|
||
|
||
|
||
def format_messages_to_chat_history(messages: List[Dict[str, str]]) -> str:
    """Render a message list as a plain-text chat transcript.

    Only ``user`` and ``assistant`` turns are included; every other role
    (``function`` etc.) is silently dropped.

    Args:
        messages: Messages as ``{"role": ..., "content": ...}`` dicts.

    Returns:
        str: One ``role: content`` line per kept message, newline-joined.
    """
    transcript = [
        f"{entry.get('role', '')}: {entry.get('content', '')}"
        for entry in messages
        if entry.get('role', '') in ('user', 'assistant')
    ]
    return "\n".join(transcript)
|
||
|
||
|
||
def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robot_type: str = "general_agent") -> Optional[str]:
    """Create the project directory for a catalog-agent bot.

    A directory is created only when ``robot_type == "catalog_agent"`` and
    ``dataset_ids`` is a non-empty list; every other combination is a no-op.

    Returns:
        Optional[str]: the created project path, or ``None`` when skipped
        or when creation fails.
    """
    # Proceed only for catalog agents that actually have datasets.
    if robot_type != "catalog_agent":
        return None
    if not dataset_ids:
        return None

    try:
        from utils.multi_project_manager import create_robot_project
        return create_robot_project(dataset_ids, bot_id)
    except Exception as e:
        logger.error(f"Error creating project directory: {e}")
        return None
|
||
|
||
|
||
def extract_api_key_from_auth(authorization: Optional[str]) -> Optional[str]:
    """Pull the API key out of an ``Authorization`` header value.

    Strips a single leading ``"Bearer "`` prefix when present; otherwise the
    value is returned unchanged.  Empty/missing input yields ``None``.
    """
    if not authorization:
        return None

    # str.removeprefix is a no-op when the prefix is absent.
    return authorization.removeprefix("Bearer ")
|
||
|
||
|
||
def generate_v2_auth_token(bot_id: str) -> str:
    """Build the auth token for the v2 backend API.

    The token is the MD5 hex digest of ``"<MASTERKEY>:<bot_id>"`` where
    MASTERKEY comes from the environment (default ``"master"``).
    """
    secret = os.getenv("MASTERKEY", "master")
    return hashlib.md5(f"{secret}:{bot_id}".encode()).hexdigest()
|
||
|
||
|
||
async def fetch_bot_config(bot_id: str) -> Dict[str, Any]:
    """Fetch a bot's configuration from the backend API.

    Performs ``GET {BACKEND_HOST}/v1/agent_bot_config/{bot_id}`` with a v2
    auth token and returns the ``data`` field of the JSON payload.

    Raises:
        HTTPException: 400 on a non-200 status or an unsuccessful payload,
            500 on connection errors or any other failure.
    """
    try:
        backend_host = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
        endpoint = f"{backend_host}/v1/agent_bot_config/{bot_id}"

        request_headers = {
            "content-type": "application/json",
            "authorization": f"Bearer {generate_v2_auth_token(bot_id)}",
        }

        # Async HTTP request so the event loop stays responsive.
        async with aiohttp.ClientSession() as session:
            async with session.get(endpoint, headers=request_headers, timeout=30) as response:
                if response.status != 200:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: API returned status code {response.status}"
                    )

                # Decode the JSON payload.
                payload = await response.json()

                if not payload.get("success"):
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: {payload.get('message', 'Unknown error')}"
                    )

                return payload.get("data", {})

    except aiohttp.ClientError as e:
        # Network-level failure talking to the backend.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect to backend API: {str(e)}"
        )
    except Exception as e:
        # Let our own HTTPExceptions propagate untouched.
        if isinstance(e, HTTPException):
            raise
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch bot config: {str(e)}"
        )
|
||
|
||
|
||
def _sync_call_guideline_llm(llm_config, messages) -> str:
    """Blocking LLM call helper, meant to run inside the thread pool.

    Returns the text content of the first message of the response, or an
    empty string on any failure.
    """
    chat_model = TextChatAtOAI(llm_config)
    try:
        # Non-streaming call so the whole reply arrives at once.
        result = chat_model.chat(messages=messages, stream=False)

        # qwen-agent may return a list of Message objects or plain dicts.
        if isinstance(result, list) and result:
            head = result[0]
            if hasattr(head, 'content'):
                return head.content
            if isinstance(head, dict) and 'content' in head:
                return head['content']

        # A bare string reply is returned as-is.
        if isinstance(result, str):
            return result

        # Fallback for any other response shape.
        return str(result) if result else ""

    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""
|
||
|
||
|
||
async def call_guideline_llm(chat_history: str, guidelines_text: str, terms: str, model_name: str, api_key: str, model_server: str) -> str:
    """Run the guideline-analysis prompt through the LLM.

    Loads the template from ``./prompt/guideline_prompt.md``, substitutes the
    placeholders, and calls the model via the shared thread pool so the event
    loop is never blocked.  Concurrency is capped by ``api_semaphore``.

    Args:
        chat_history: Chat transcript to analyse.
        guidelines_text: Guideline definitions to check against.
        terms: Domain term definitions to substitute into the template.
        model_name: Model identifier.
        api_key: API key for the model server.
        model_server: Base URL of the model server.

    Returns:
        str: The model's raw response, or "" on any failure.
    """
    # Load the prompt template; bail out early if it is unreadable.
    try:
        with open('./prompt/guideline_prompt.md', 'r', encoding='utf-8') as f:
            guideline_template = f.read()
    except Exception as e:
        logger.error(f"Error reading guideline prompt template: {e}")
        return ""

    # Fill the placeholders.  str.replace (not str.format) keeps any other
    # braces in the template untouched.
    system_prompt = guideline_template.replace('{chat_history}', chat_history).replace('{guidelines_text}', guidelines_text).replace('{terms}', terms)

    # LLM configuration for this call.
    llm_config = {
        'model': model_name,
        'api_key': api_key,
        'model_server': model_server,  # endpoint supplied by the caller
    }

    # Single-turn prompt.
    messages = [{'role': 'user', 'content': system_prompt}]

    try:
        # Cap the number of concurrent API calls.
        async with api_semaphore:
            # Run the synchronous HTTP call in the thread pool so the event
            # loop is not blocked.  get_running_loop() is the supported API
            # inside a coroutine (get_event_loop() is deprecated there).
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(thread_pool, _sync_call_guideline_llm, llm_config, messages)
            return response

    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""
|
||
|
||
|
||
def _get_optimal_batch_size(guidelines_count: int) -> int:
|
||
"""根据guidelines数量决定最优批次数量(并发数)"""
|
||
if guidelines_count <= 5:
|
||
return 1
|
||
elif guidelines_count <= 10:
|
||
return 2
|
||
elif guidelines_count <= 20:
|
||
return 3
|
||
elif guidelines_count <= 30:
|
||
return 4
|
||
else:
|
||
return 5
|
||
|
||
|
||
async def process_guideline_batch(
    guidelines_batch: List[str],
    chat_history: str,
    terms: str,
    model_name: str,
    api_key: str,
    model_server: str
) -> Any:  # was `-> str`; a parsed JSON dict/list is returned on success
    """Analyse one batch of guidelines against the chat history via the LLM.

    Retries up to three times.  On success the JSON payload embedded in the
    reply (inside a ```json fence) is returned parsed; if the reply never
    yields valid JSON, the raw reply text is returned; if every attempt
    raises, "" is returned.

    Args:
        guidelines_batch: Guideline lines belonging to this batch.
        chat_history: Transcript to analyse.
        terms: Domain term definitions.
        model_name: Model identifier.
        api_key: API key for the model server.
        model_server: Base URL of the model server.

    Returns:
        Parsed JSON (dict/list), the raw reply text, or "" on total failure.
    """
    max_retries = 3

    for attempt in range(max_retries):
        try:
            # Join this batch into one prompt section and call the LLM.
            batch_guidelines_text = "\n".join(guidelines_batch)
            logger.info(f"Start processed guideline batch on attempt {attempt + 1}")
            batch_analysis = await call_guideline_llm(chat_history, batch_guidelines_text, terms, model_name, api_key, model_server)

            # Extract the ```json ... ``` fenced payload from the reply.
            json_pattern = r'```json\s*\n(.*?)\n```'
            json_matches = re.findall(json_pattern, batch_analysis, re.DOTALL)

            if json_matches:
                try:
                    # Parse the first fenced JSON object found.
                    json_data = json.loads(json_matches[0])
                    logger.info(f"Successfully processed guideline batch on attempt {attempt + 1}")
                    return json_data  # parsed JSON object (dict or list)
                except json.JSONDecodeError as e:
                    logger.error(f"Error parsing JSON from guideline analysis on attempt {attempt + 1}: {e}")
                    if attempt == max_retries - 1:
                        return batch_analysis  # last attempt: fall back to raw text
                    continue
            else:
                logger.warning(f"No JSON format found in guideline analysis on attempt {attempt + 1}")
                if attempt == max_retries - 1:
                    return batch_analysis  # last attempt: fall back to raw text
                continue

        except Exception as e:
            logger.error(f"Error processing guideline batch on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                return ""  # every attempt failed

    # Unreachable in practice; kept for defensive completeness.
    return ""
|
||
|
||
|
||
def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split ``guideline``/``terms`` fenced blocks out of a system prompt.

    Fenced code blocks tagged ``guideline`` or ``terms`` are parsed into
    structured lists and removed from the prompt; blocks with any other tag
    are left in place.

    Args:
        system_prompt: The raw system prompt (may be None/empty).

    Returns:
        tuple[str, List[Dict], List[Dict]]: (cleaned prompt, guidelines, terms).
    """
    if not system_prompt:
        return "", [], []

    guidelines_list: List[Dict[str, Any]] = []
    terms_list: List[Dict[str, Any]] = []
    consumed_blocks = []

    # Walk every ```lang ...``` fenced block in the prompt.
    fence_re = r'```(\w+)\s*\n(.*?)\n```'
    for match in re.finditer(fence_re, system_prompt, re.DOTALL):
        block_type, body = match.groups()

        if block_type == 'guideline':
            try:
                guidelines_list.extend(parse_guidelines_text(body.strip()))
                consumed_blocks.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing guidelines: {e}")

        elif block_type == 'terms':
            try:
                terms_list.extend(parse_terms_text(body.strip()))
                consumed_blocks.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing terms: {e}")

    # Drop each successfully parsed block from the prompt text.
    cleaned_prompt = system_prompt
    for block in consumed_blocks:
        cleaned_prompt = cleaned_prompt.replace(block, '', 1)

    # Collapse runs of blank lines left behind by the removals.
    cleaned_prompt = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_prompt).strip()

    return cleaned_prompt, guidelines_list, terms_list
|
||
|
||
|
||
def parse_guidelines_text(text: str) -> List[Dict[str, Any]]:
    """Parse guideline definitions from text, supporting several formats.

    Accepts JSON (a dict or a list of dicts) or line-based formats:
    ``1) Condition: ... Action: ... [Priority: n]``,
    ``condition -> action [n]``, ``if condition then action [n]``.
    Any other non-comment line becomes an unconditional action.

    Args:
        text: Raw guideline text.

    Returns:
        List[Dict]: guidelines with ``id``/``condition``/``action``/``priority``.
    """
    parsed: List[Dict[str, Any]] = []

    # JSON input: a single object or an array of objects.
    if text.strip().startswith(('[', '{')):
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            payload = None
        if isinstance(payload, dict):
            return [payload]
        if isinstance(payload, list):
            return [entry for entry in payload if isinstance(entry, dict)]

    # Line-based formats, tried in order of specificity.
    line_rules = (
        # "1) Condition: ... Action: ... [Priority: n]" — carries its own id.
        (re.compile(r'(\d+)\)\s*Condition:\s*(.*?)\s*Action:\s*(.*?)(?:\s*Priority:\s*(\d+))?$', re.IGNORECASE), True),
        # "condition -> action [n]"
        (re.compile(r'(?:\d+\)\s*)?(.*?)\s*->\s*(.*?)(?:\s*\[(\d+)\])?$', re.IGNORECASE), False),
        # "if condition then action [n]"
        (re.compile(r'(?:\d+\)\s*)?if\s+(.*?)\s+then\s+(.*?)(?:\s*\[(\d+)\])?$', re.IGNORECASE), False),
    )

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # Skip blanks and comment lines.
        if not line or line.startswith(('#', '//')):
            continue

        for rule, carries_id in line_rules:
            m = rule.match(line)
            if m:
                if carries_id:
                    gid_text, condition, action, priority = m.groups()
                    gid = int(gid_text)
                else:
                    condition, action, priority = m.groups()
                    gid = len(parsed) + 1
                parsed.append({
                    'id': gid,
                    'condition': condition.strip(),
                    'action': action.strip(),
                    'priority': int(priority) if priority else 1
                })
                break
        else:
            # Fallback: the whole line is an unconditional action.
            parsed.append({
                'id': len(parsed) + 1,
                'condition': '',
                'action': line.strip(),
                'priority': 1
            })

    return parsed
|
||
|
||
|
||
def parse_terms_text(text: str) -> List[Dict[str, Any]]:
    """Parse domain-term definitions from text, supporting several formats.

    Accepts JSON (a dict or a list of dicts) or line-based formats:
    ``1) Name: x, Description: d, Synonyms: a, b`` and the compact
    ``| name | description | synonyms`` form.  Unrecognised lines are
    ignored.

    Args:
        text: Raw terms text.

    Returns:
        List[Dict]: terms with ``name`` and optional ``description``/``synonyms``.
    """
    collected: List[Dict[str, Any]] = []

    # JSON input: a single object or an array of objects.
    if text.strip().startswith(('[', '{')):
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            payload = None
        if isinstance(payload, dict):
            return [payload]
        if isinstance(payload, list):
            return [entry for entry in payload if isinstance(entry, dict)]

    # "1) Name: x, Description: d, Synonyms: a, b" (number prefix optional).
    named_re = re.compile(
        r'(?:\d+\)\s*)?Name:\s*([^,]+)(?:,\s*Description:\s*([^,]+))?(?:,\s*Synonyms:\s*(.+))?',
        re.IGNORECASE,
    )

    pending: Dict[str, Any] = {}  # term being assembled across lines

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        # Skip blanks and comment lines.
        if not line or line.startswith(('#', '//')):
            continue

        m = named_re.match(line)
        if m:
            name = m.group(1).strip()
            description = m.group(2).strip() if m.group(2) else ''
            synonyms_text = m.group(3).strip() if m.group(3) else ''

            entry: Dict[str, Any] = {'name': name}
            if description:
                entry['description'] = description
            if synonyms_text:
                pieces = re.split(r'[,;|]', synonyms_text)
                entry['synonyms'] = [p.strip() for p in pieces if p.strip()]

            if pending:  # flush the previous term
                collected.append(pending)
            pending = entry
            continue

        # Compact "| name | description | synonyms" form.
        if line.startswith('|'):
            fields = [p.strip() for p in line[1:].split('|', 2)]  # at most 3 fields
            if len(fields) >= 1:
                if pending:
                    collected.append(pending)
                pending = {'name': fields[0]}
                if len(fields) >= 2:
                    pending['description'] = fields[1]
                if len(fields) >= 3:
                    pieces = re.split(r'[,;|]', fields[2])
                    pending['synonyms'] = [p.strip() for p in pieces if p.strip()]
            continue

    # Flush the final term, if any.
    if pending:
        collected.append(pending)

    return collected
|