"""qwen_agent/utils/fastapi_utils.py

Utility helpers for the FastAPI service layer.

Last updated: 2025-11-27 21:50:03 +08:00 (717 lines, 27 KiB, Python).

NOTE: this file contains ambiguous Unicode characters that might be confused
with other characters. If that is intentional, this warning can be ignored.
"""
import os
import re
import hashlib
import json
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Optional, Union, Any
import aiohttp
from qwen_agent.llm.schema import ASSISTANT, FUNCTION
from qwen_agent.llm.oai import TextChatAtOAI
from fastapi import HTTPException
import logging
logger = logging.getLogger('app')
# Global thread pool used to run blocking HTTP/LLM calls off the event loop.
thread_pool = ThreadPoolExecutor(max_workers=10)
# Concurrency semaphore bounding simultaneous outbound API calls.
api_semaphore = asyncio.Semaphore(8) # at most 8 concurrent API calls
def get_versioned_filename(upload_dir: str, name_without_ext: str, file_extension: str) -> tuple[str, int]:
    """Compute a versioned filename for an upload, purging older copies.

    Scans *upload_dir* for the base file (``name_without_ext + file_extension``)
    and any versioned siblings (``name_2.ext``, ``name_3.ext``, ...). All of
    them are deleted, and the next version number is one past the highest
    version found (or 2 when only the base file existed).

    Args:
        upload_dir: Directory holding the uploaded files.
        name_without_ext: File name without its extension.
        file_extension: Extension, including the leading dot.

    Returns:
        tuple[str, int]: (final file name to use, its version number).
    """
    base_name = name_without_ext + file_extension
    version_re = re.compile(
        re.escape(name_without_ext) + r'_(\d+)' + re.escape(file_extension) + r'$'
    )

    base_exists = os.path.exists(os.path.join(upload_dir, base_name))
    versions: list[int] = []
    doomed: list[str] = []

    for entry in os.listdir(upload_dir):
        if entry == base_name:
            doomed.append(entry)
        else:
            m = version_re.match(entry)
            if m:
                versions.append(int(m.group(1)))
                doomed.append(entry)

    # First upload (nothing to clean up): keep the plain name, version 1.
    if not base_exists and not versions:
        return base_name, 1

    # Remove every older copy — base file and versioned siblings alike.
    for entry in doomed:
        path = os.path.join(upload_dir, entry)
        try:
            os.remove(path)
            logger.info(f"已删除文件: {path}")
        except OSError as e:
            logger.error(f"删除文件失败 {path}: {e}")

    next_version = max(versions) + 1 if versions else 2
    return f"{name_without_ext}_{next_version}{file_extension}", next_version
def get_content_from_messages(messages: List[dict], tool_response: bool = True) -> str:
    """Flatten qwen-agent messages into a single tagged text transcript.

    Assistant reasoning, answers, tool calls and tool results are emitted as
    sections prefixed with [THINK]/[ANSWER]/[TOOL_CALL]/[TOOL_RESPONSE].
    Partially streamed ``<tool_call`` fragments left over from streaming
    output are stripped before anything is appended.

    Args:
        messages: qwen-agent style message dicts.
        tool_response: When False, any FUNCTION-role message raises TypeError.

    Returns:
        str: newline-joined sections, or '' when nothing was collected.
    """
    TOOL_CALL_S = '[TOOL_CALL]'
    TOOL_RESULT_S = '[TOOL_RESPONSE]'
    THOUGHT_S = '[THINK]'
    ANSWER_S = '[ANSWER]'

    sections = []
    for msg in messages:
        if msg['role'] == ASSISTANT:
            if msg.get('reasoning_content'):
                assert isinstance(msg['reasoning_content'], str), 'Now only supports text messages'
                sections.append(f'{THOUGHT_S}\n{msg["reasoning_content"]}')
            if msg.get('content'):
                assert isinstance(msg['content'], str), 'Now only supports text messages'
                # Drop a trailing, partially streamed "<tool_call" fragment.
                answer_text = re.sub(r'<t?o?o?l?_?c?a?l?l?$', '', msg["content"])
                if answer_text.strip():
                    sections.append(f'{ANSWER_S}\n{answer_text}')
            if msg.get('function_call'):
                # Strip a truncated closing "</tool_call" tail from the args.
                args_text = re.sub(r'}\n<\/?t?o?o?l?_?c?a?l?l?$', '', msg["function_call"]["arguments"])
                if args_text.strip():
                    sections.append(f'{TOOL_CALL_S} {msg["function_call"]["name"]}\n{args_text}')
        elif msg['role'] == FUNCTION:
            if tool_response:
                sections.append(f'{TOOL_RESULT_S} {msg["name"]}\n{msg["content"]}')
            else:
                raise TypeError
        # Other roles (e.g. user/system) are silently skipped.
    return '\n'.join(sections)
def process_messages(messages: List[Dict], language: Optional[str] = None) -> List[Dict[str, str]]:
    """Compress tagged assistant transcripts and re-expand tool traffic.

    This is the inverse of ``get_content_from_messages``: assistant messages
    whose content was flattened into ``[TOOL_CALL]/[TOOL_RESPONSE]/[ANSWER]``
    sections are first filtered (old tool traffic dropped, long tool responses
    abridged), then re-assembled into proper ``role == FUNCTION`` messages and
    ``function_call`` payloads.

    Args:
        messages: Message objects exposing ``.role`` and ``.content``.
        language: Currently unused; kept for interface compatibility.

    Returns:
        List[Dict[str, str]]: plain message dicts ready for the LLM.
    """
    processed_messages = []
    # Indices of every assistant message; only the trailing ones (past
    # ``cutoff_point``) keep their tool-call / tool-response sections.
    assistant_indices = [i for i, msg in enumerate(messages) if msg.role == "assistant"]
    total_assistant_messages = len(assistant_indices)
    # Assistant messages before this position are considered "old".
    cutoff_point = max(0, total_assistant_messages - 5)
    for msg_idx, msg in enumerate(messages):
        if msg.role == "assistant":
            # Rank of this assistant message among all assistant messages (0-based).
            assistant_position = assistant_indices.index(msg_idx)
            # re.split with a capture group yields alternating text/tag parts.
            parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg.content)
            filtered_content = ""
            current_tag = None
            # Only the last 5 assistant messages retain tool traffic.
            is_recent_message = assistant_position >= cutoff_point
            # BUGFIX: use a distinct inner loop variable; the original code
            # shadowed the enclosing enumerate index ``i``.
            for part_idx in range(0, len(parts)):
                if part_idx % 2 == 0:  # text segment
                    text = parts[part_idx].strip()
                    if not text:
                        continue
                    if current_tag == "TOOL_RESPONSE":
                        if is_recent_message:
                            # Recent messages keep TOOL_RESPONSE, abridged when long.
                            if len(text) <= 500:
                                filtered_content += f"[TOOL_RESPONSE]\n{text}\n"
                            else:
                                # Keep three 250-char excerpts: head, middle, tail.
                                first_part = text[:250]
                                middle_start = len(text) // 2 - 125
                                middle_part = text[middle_start:middle_start + 250]
                                last_part = text[-250:]
                                # Total number of characters not shown.
                                omitted_count = len(text) - 750
                                omitted_text = f"...此处省略{omitted_count}字..."
                                truncated_text = f"{first_part}\n{omitted_text}\n{middle_part}\n{omitted_text}\n{last_part}"
                                filtered_content += f"[TOOL_RESPONSE]\n{truncated_text}\n"
                        # Older messages: TOOL_RESPONSE is dropped entirely.
                    elif current_tag == "TOOL_CALL":
                        if is_recent_message:
                            filtered_content += f"[TOOL_CALL]\n{text}\n"
                        # Older messages: TOOL_CALL is dropped entirely.
                    elif current_tag == "ANSWER":
                        # ANSWER sections are always kept.
                        filtered_content += f"[ANSWER]\n{text}\n"
                    else:
                        # Text preceding the first tag.
                        filtered_content += text + "\n"
                else:  # tag name
                    current_tag = parts[part_idx]
            final_content = filtered_content.strip()
            if final_content:
                processed_messages.append({"role": msg.role, "content": final_content})
            else:
                # Fall back to the raw content if filtering emptied it.
                processed_messages.append({"role": msg.role, "content": msg.content})
        else:
            processed_messages.append({"role": msg.role, "content": msg.content})
    # Inverse step: explode [TOOL_RESPONSE] sections back into
    # role == FUNCTION messages and function_call payloads.
    final_messages = []
    for msg in processed_messages:
        if msg["role"] == ASSISTANT and "[TOOL_RESPONSE]" in msg["content"]:
            parts = re.split(r'\[(TOOL_CALL|TOOL_RESPONSE|ANSWER)\]', msg["content"])
            current_tag = None
            assistant_content = ""
            function_calls = []
            tool_responses = []
            for part_idx in range(0, len(parts)):
                if part_idx % 2 == 0:  # text segment
                    text = parts[part_idx].strip()
                    if not text:
                        continue
                    if current_tag == "TOOL_RESPONSE":
                        # Section format: "[TOOL_RESPONSE] function_name\ncontent"
                        lines = text.split('\n', 1)
                        function_name = lines[0].strip() if lines else ""
                        response_content = lines[1].strip() if len(lines) > 1 else ""
                        tool_responses.append({
                            "role": FUNCTION,
                            "name": function_name,
                            "content": response_content
                        })
                    elif current_tag == "TOOL_CALL":
                        # Section format: "[TOOL_CALL] function_name\narguments"
                        lines = text.split('\n', 1)
                        function_name = lines[0].strip() if lines else ""
                        arguments = lines[1].strip() if len(lines) > 1 else ""
                        function_calls.append({
                            "name": function_name,
                            "arguments": arguments
                        })
                    elif current_tag == "ANSWER":
                        assistant_content += text + "\n"
                    else:
                        # Text before the first tag also belongs to the assistant.
                        assistant_content += text + "\n"
                else:  # tag name
                    current_tag = parts[part_idx]
            # Emit the assistant message when it carries content or a call.
            if assistant_content.strip() or function_calls:
                assistant_msg = {"role": ASSISTANT}
                if assistant_content.strip():
                    assistant_msg["content"] = assistant_content.strip()
                if function_calls:
                    # Only the first function_call is kept (legacy behaviour).
                    assistant_msg["function_call"] = function_calls[0]
                final_messages.append(assistant_msg)
            # Tool responses become standalone FUNCTION-role messages.
            final_messages.extend(tool_responses)
        else:
            # Non-assistant messages (or ones without [TOOL_RESPONSE]) pass through.
            final_messages.append(msg)
    return final_messages
def format_messages_to_chat_history(messages: List[Dict[str, str]]) -> str:
    """Render messages as a plain-text transcript.

    Only 'user' and 'assistant' roles are included; every other role
    (e.g. 'function') is silently skipped.

    Args:
        messages: Message dicts with 'role' and 'content' keys.

    Returns:
        str: one "role: content" line per kept message, newline-joined.
    """
    lines = [
        f"{role}: {message.get('content', '')}"
        for message in messages
        if (role := message.get('role', '')) in ('user', 'assistant')
    ]
    return "\n".join(lines)
def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robot_type: str = "general_agent") -> Optional[str]:
    """Create a robot project directory when applicable.

    A directory is only created for catalog agents that actually have
    datasets; every other combination is a no-op.

    Args:
        dataset_ids: Dataset identifiers bound to the robot, possibly empty/None.
        bot_id: The robot identifier.
        robot_type: Robot flavour; only "catalog_agent" triggers creation.

    Returns:
        Optional[str]: the created project path, or None when skipped or failed.
    """
    if robot_type == "catalog_agent" and dataset_ids:
        try:
            from utils.multi_project_manager import create_robot_project
            return create_robot_project(dataset_ids, bot_id)
        except Exception as e:
            logger.error(f"Error creating project directory: {e}")
    return None
def extract_api_key_from_auth(authorization: Optional[str]) -> Optional[str]:
    """Pull the API key out of an Authorization header value.

    Strips a single leading "Bearer " prefix when present; any other
    non-empty value is returned untouched. None/empty input yields None.
    """
    if not authorization:
        return None
    return authorization.removeprefix("Bearer ")
def generate_v2_auth_token(bot_id: str) -> str:
    """Derive the v2 API auth token for a bot.

    The token is the hex MD5 digest of "<MASTERKEY>:<bot_id>", where
    MASTERKEY comes from the environment (default "master").

    NOTE(review): MD5 is cryptographically weak; acceptable here only
    because the backend expects this exact scheme — confirm before reuse.
    """
    secret = os.getenv("MASTERKEY", "master")
    return hashlib.md5(f"{secret}:{bot_id}".encode()).hexdigest()
async def fetch_bot_config(bot_id: str) -> Dict[str, Any]:
    """Fetch a bot's configuration from the backend API.

    Calls ``{BACKEND_HOST}/v1/agent_bot_config/{bot_id}`` with a v2 auth
    token and returns the ``data`` payload of a successful response.

    Raises:
        HTTPException: 400 on a non-200 status or an unsuccessful payload;
            500 on connection errors or any other unexpected failure.
    """
    try:
        backend_host = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
        url = f"{backend_host}/v1/agent_bot_config/{bot_id}"
        auth_token = generate_v2_auth_token(bot_id)
        headers = {
            "content-type": "application/json",
            "authorization": f"Bearer {auth_token}",
        }
        # Async HTTP call; 30-second overall timeout.
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=30) as response:
                if response.status != 200:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: API returned status code {response.status}"
                    )
                response_data = await response.json()
                if not response_data.get("success"):
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to fetch bot config: {response_data.get('message', 'Unknown error')}"
                    )
                return response_data.get("data", {})
    except aiohttp.ClientError as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to connect to backend API: {str(e)}"
        )
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to fetch bot config: {str(e)}"
        )
def _sync_call_guideline_llm(llm_config, messages) -> str:
    """Blocking LLM call helper, executed on the thread pool.

    Normalises the various shapes ``TextChatAtOAI.chat`` may return
    (Message list, dict list, plain string, other) down to a single string.
    Returns '' on any error or falsy response.
    """
    llm_instance = TextChatAtOAI(llm_config)
    try:
        # Non-streaming call: stream=False yields a complete response.
        response = llm_instance.chat(messages=messages, stream=False)
        if isinstance(response, list) and response:
            first = response[0]
            # Message objects expose .content; plain dicts carry a key.
            if hasattr(first, 'content'):
                return first.content
            if isinstance(first, dict) and 'content' in first:
                return first['content']
        if isinstance(response, str):
            return response
        # Anything else: stringify when truthy, else empty.
        return str(response) if response else ""
    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""
async def call_guideline_llm(chat_history: str, guidelines_text: str, terms: str, model_name: str, api_key: str, model_server: str) -> str:
    """Run the guideline-analysis prompt through the LLM.

    Loads the prompt template from ./prompt/guideline_prompt.md, fills in
    the chat history, guidelines and terms, and executes the blocking LLM
    call on the shared thread pool, bounded by ``api_semaphore``.

    Args:
        chat_history: Formatted chat transcript.
        guidelines_text: Guidelines text to analyse.
        terms: Terminology definitions injected into the prompt.
        model_name: Model identifier.
        api_key: API key for the model server.
        model_server: Base URL of the model server.

    Returns:
        str: The model response, or '' when the template cannot be read
        or the call fails.
    """
    try:
        with open('./prompt/guideline_prompt.md', 'r', encoding='utf-8') as f:
            guideline_template = f.read()
    except Exception as e:
        logger.error(f"Error reading guideline prompt template: {e}")
        return ""
    # str.replace (not str.format) so other literal braces in the template survive.
    system_prompt = (
        guideline_template
        .replace('{chat_history}', chat_history)
        .replace('{guidelines_text}', guidelines_text)
        .replace('{terms}', terms)
    )
    llm_config = {
        'model': model_name,
        'api_key': api_key,
        'model_server': model_server,
    }
    messages = [{'role': 'user', 'content': system_prompt}]
    try:
        # Cap concurrent API calls, then run the blocking client off-loop.
        async with api_semaphore:
            # FIX: get_running_loop() — asyncio.get_event_loop() inside a
            # coroutine is deprecated and can bind the wrong loop.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(thread_pool, _sync_call_guideline_llm, llm_config, messages)
    except Exception as e:
        logger.error(f"Error calling guideline LLM: {e}")
        return ""
def _get_optimal_batch_size(guidelines_count: int) -> int:
"""根据guidelines数量决定最优批次数量并发数"""
if guidelines_count <= 5:
return 1
elif guidelines_count <= 10:
return 2
elif guidelines_count <= 20:
return 3
elif guidelines_count <= 30:
return 4
else:
return 5
async def process_guideline_batch(
    guidelines_batch: List[str],
    chat_history: str,
    terms: str,
    model_name: str,
    api_key: str,
    model_server: str
) -> Any:
    """Analyse one batch of guidelines via the LLM, with retries.

    Calls ``call_guideline_llm`` up to 3 times, extracting and parsing the
    first ```json fenced block from each response.

    Returns:
        The parsed JSON object on success; the raw model response text when
        no valid JSON could be extracted after all retries; '' when every
        attempt raised.
        (FIX: annotation was ``-> str`` although the success path returns
        the parsed JSON structure, not a string.)
    """
    max_retries = 3
    # Matches the first ```json ... ``` fenced block (hoisted loop invariant).
    json_pattern = r'```json\s*\n(.*?)\n```'
    for attempt in range(max_retries):
        try:
            batch_guidelines_text = "\n".join(guidelines_batch)
            logger.info(f"Start processed guideline batch on attempt {attempt + 1}")
            batch_analysis = await call_guideline_llm(chat_history, batch_guidelines_text, terms, model_name, api_key, model_server)
            json_matches = re.findall(json_pattern, batch_analysis, re.DOTALL)
            if json_matches:
                try:
                    # Parse the first JSON block found in the response.
                    json_data = json.loads(json_matches[0])
                    logger.info(f"Successfully processed guideline batch on attempt {attempt + 1}")
                    return json_data
                except json.JSONDecodeError as e:
                    logger.error(f"Error parsing JSON from guideline analysis on attempt {attempt + 1}: {e}")
                    if attempt == max_retries - 1:
                        return batch_analysis  # give up: return the raw text
                    continue
            else:
                logger.warning(f"No JSON format found in guideline analysis on attempt {attempt + 1}")
                if attempt == max_retries - 1:
                    return batch_analysis  # give up: return the raw text
                continue
        except Exception as e:
            logger.error(f"Error processing guideline batch on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                return ""  # all attempts raised
    # Defensive: unreachable because the last attempt always returns.
    return ""
def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split ```guideline and ```terms fenced blocks out of a system prompt.

    Args:
        system_prompt: Raw system prompt, possibly None or empty.

    Returns:
        tuple[str, List[Dict], List[Dict]]: (prompt with the parsed blocks
        removed and excess blank lines collapsed, guidelines list, terms list).
    """
    if not system_prompt:
        return "", [], []

    guidelines_list: List[Dict[str, Any]] = []
    terms_list: List[Dict[str, Any]] = []
    consumed_blocks = []

    # Walk every fenced code block and pick out guideline/terms ones.
    for match in re.finditer(r'```(\w+)\s*\n(.*?)\n```', system_prompt, re.DOTALL):
        block_type, body = match.groups()
        if block_type == 'guideline':
            try:
                guidelines_list.extend(parse_guidelines_text(body.strip()))
                consumed_blocks.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing guidelines: {e}")
        elif block_type == 'terms':
            try:
                terms_list.extend(parse_terms_text(body.strip()))
                consumed_blocks.append(match.group(0))
            except Exception as e:
                logger.error(f"Error parsing terms: {e}")

    # Strip each successfully parsed block from the prompt.
    cleaned_prompt = system_prompt
    for block in consumed_blocks:
        cleaned_prompt = cleaned_prompt.replace(block, '', 1)
    # Collapse runs of 3+ newlines left behind by the removals.
    cleaned_prompt = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_prompt).strip()
    return cleaned_prompt, guidelines_list, terms_list
def parse_guidelines_text(text: str) -> List[Dict[str, Any]]:
    """Parse guideline definitions from text.

    Accepted formats, tried in order per line:
      * JSON (whole text): list of objects or a single object, passed through
      * "id) Condition: ... Action: ... [Priority: n]"
      * "condition -> action [n]"
      * "if condition then action [n]"
      * fallback: the whole line becomes the action, condition left empty

    Lines starting with '#' or '//' are treated as comments and skipped.

    Returns:
        List[Dict]: dicts with id/condition/action/priority keys (JSON
        entries are passed through untouched).
    """
    # Whole-text JSON takes precedence over line-based parsing.
    if text.strip().startswith(('[', '{')):
        try:
            data = json.loads(text)
            if isinstance(data, list):
                return [item for item in data if isinstance(item, dict)]
            if isinstance(data, dict):
                return [data]
            return []
        except json.JSONDecodeError:
            pass  # fall through to line-based parsing

    guidelines: List[Dict[str, Any]] = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line or line.startswith('#') or line.startswith('//'):
            continue
        # "1) Condition: ... Action: ... Priority: n"
        m = re.match(r'(\d+)\)\s*Condition:\s*(.*?)\s*Action:\s*(.*?)(?:\s*Priority:\s*(\d+))?$', line, re.IGNORECASE)
        if m:
            guidelines.append({
                'id': int(m.group(1)),
                'condition': m.group(2).strip(),
                'action': m.group(3).strip(),
                'priority': int(m.group(4)) if m.group(4) else 1,
            })
            continue
        # "condition -> action [n]"
        m = re.match(r'(?:\d+\)\s*)?(.*?)\s*->\s*(.*?)(?:\s*\[(\d+)\])?$', line, re.IGNORECASE)
        if m:
            guidelines.append({
                'id': len(guidelines) + 1,
                'condition': m.group(1).strip(),
                'action': m.group(2).strip(),
                'priority': int(m.group(3)) if m.group(3) else 1,
            })
            continue
        # "if condition then action [n]"
        m = re.match(r'(?:\d+\)\s*)?if\s+(.*?)\s+then\s+(.*?)(?:\s*\[(\d+)\])?$', line, re.IGNORECASE)
        if m:
            guidelines.append({
                'id': len(guidelines) + 1,
                'condition': m.group(1).strip(),
                'action': m.group(2).strip(),
                'priority': int(m.group(3)) if m.group(3) else 1,
            })
            continue
        # Fallback: condition-less guideline, whole line is the action.
        guidelines.append({
            'id': len(guidelines) + 1,
            'condition': '',
            'action': line.strip(),
            'priority': 1,
        })
    return guidelines
def parse_terms_text(text: str) -> List[Dict[str, Any]]:
    """Parse terminology definitions from text.

    Accepted formats:
      * JSON (whole text): list of objects or a single object, passed through
      * "N) Name: x, Description: d, Synonyms: s1, s2"
      * "| name | description | synonyms" pipe rows

    Lines starting with '#' or '//' are comments; unrecognised lines are
    silently ignored.

    Returns:
        List[Dict]: dicts with 'name' and optional 'description'/'synonyms'.
    """
    # Whole-text JSON takes precedence over line-based parsing.
    if text.strip().startswith(('[', '{')):
        try:
            data = json.loads(text)
            if isinstance(data, list):
                return [item for item in data if isinstance(item, dict)]
            if isinstance(data, dict):
                return [data]
            return []
        except json.JSONDecodeError:
            pass  # fall through to line-based parsing

    terms: List[Dict[str, Any]] = []
    pending: Dict[str, Any] = {}
    name_line_re = re.compile(
        r'(?:\d+\)\s*)?Name:\s*([^,]+)(?:,\s*Description:\s*([^,]+))?(?:,\s*Synonyms:\s*(.+))?',
        re.IGNORECASE,
    )
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line or line.startswith('#') or line.startswith('//'):
            continue
        m = name_line_re.match(line)
        if m:
            entry: Dict[str, Any] = {'name': m.group(1).strip()}
            # Whitespace-only captures are treated as absent, matching the
            # original strip-then-check behaviour.
            desc = m.group(2).strip() if m.group(2) else ''
            if desc:
                entry['description'] = desc
            syn_text = m.group(3).strip() if m.group(3) else ''
            if syn_text:
                entry['synonyms'] = [s.strip() for s in re.split(r'[,;|]', syn_text) if s.strip()]
            if pending:  # flush the previous term
                terms.append(pending)
            pending = entry
            continue
        # "| name | description | synonyms" simplified row format.
        if line.startswith('|'):
            cells = [c.strip() for c in line[1:].split('|', 2)]  # at most 3 cells
            if cells:
                if pending:
                    terms.append(pending)
                pending = {'name': cells[0]}
                if len(cells) >= 2:
                    pending['description'] = cells[1]
                if len(cells) >= 3:
                    pending['synonyms'] = [s.strip() for s in re.split(r'[,;|]', cells[2]) if s.strip()]
            continue
    # Flush the final term.
    if pending:
        terms.append(pending)
    return terms