# qwen_agent/routes/skill_manager.py

import os
import re
import shutil
import zipfile
import logging
import asyncio
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File, Form
from pydantic import BaseModel
from utils.settings import SKILLS_DIR
import aiofiles

# Module logger; every message emitted here goes through the logger named 'app'.
logger = logging.getLogger('app')

# Router on which the skill endpoints defined in this module are registered.
router = APIRouter()


class SkillItem(BaseModel):
    # One skill entry returned by the skill list endpoint.
    # Skill name taken from the SKILL.md frontmatter.
    name: str
    # Skill description taken from the SKILL.md frontmatter.
    description: str
    # True for skills found in a bot's upload directory, False for official skills.
    user_skill: bool = False


class SkillListResponse(BaseModel):
    # Response body of the skill list endpoint.
    # All skills visible to the requesting bot.
    skills: List[SkillItem]
    # Number of entries in `skills`.
    total: int


# ============ Security constants ============
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB: maximum size of an uploaded file
MAX_UNCOMPRESSED_SIZE = 500 * 1024 * 1024  # 500MB: maximum total size after extraction
MAX_COMPRESSION_RATIO = 100  # maximum allowed compression ratio, 100:1
MAX_ZIP_ENTRIES = 1000  # maximum number of entries allowed inside a zip file


def validate_bot_id(bot_id: str) -> str:
    """Validate that a bot_id is safe to use as a single path component.

    Args:
        bot_id: Identifier supplied by the client.

    Returns:
        The unchanged bot_id when it passes validation.

    Raises:
        HTTPException: 400 when bot_id is empty, or when it contains a
            path-traversal sequence, a path separator, a control character,
            or is a bare ".".
    """
    if not bot_id:
        raise HTTPException(status_code=400, detail="bot_id 不能为空")

    # The value is later joined into filesystem paths, so reject anything that
    # could escape or alias the per-bot directory: ".." sequences, either path
    # separator, a bare "." (same directory as its parent), and control
    # characters such as NUL or newlines.
    has_traversal = '..' in bot_id or '/' in bot_id or '\\' in bot_id
    has_control_char = any(ord(ch) < 32 or ord(ch) == 127 for ch in bot_id)
    if has_traversal or has_control_char or bot_id == '.':
        raise HTTPException(status_code=400, detail="bot_id 包含非法字符")

    # The UUID shape is advisory only: a mismatch is logged, never rejected.
    # fullmatch is used because "$" with re.match also matches just before a
    # trailing newline.
    uuid_pattern = r'[a-fA-F0-9-]{36}'
    if not re.fullmatch(uuid_pattern, bot_id):
        logger.warning(f"bot_id 格式可能无效: {bot_id}")

    return bot_id


async def validate_upload_file_size(file: UploadFile) -> int:
    """Measure an uploaded file by streaming it and enforce the size limit.

    The file is rewound to the start, read in 8192-byte chunks and counted.
    The file pointer is rewound again before returning or raising.

    Args:
        file: The uploaded file to measure.

    Returns:
        Total number of bytes read from the file.

    Raises:
        HTTPException: 413 as soon as the running total exceeds MAX_FILE_SIZE.
    """
    read_size = 8192
    total_bytes = 0

    # Always count from the beginning, regardless of any earlier reads.
    await file.seek(0)

    while True:
        data = await file.read(read_size)
        if not data:
            break

        total_bytes += len(data)
        if total_bytes <= MAX_FILE_SIZE:
            continue

        # Over the limit: rewind, then stop reading immediately.
        await file.seek(0)
        raise HTTPException(
            status_code=413,
            detail=f"文件过大，最大允许 {MAX_FILE_SIZE // (1024*1024)}MB"
        )

    # Leave the pointer at the start for whoever reads the file next.
    await file.seek(0)
    return total_bytes


async def safe_extract_zip(zip_path: str, extract_dir: str) -> None:
    """Extract a zip archive while guarding against ZipSlip and zip bombs.

    Every entry is validated before anything is written, so a rejected
    archive does not leave a partial extraction behind.

    Args:
        zip_path: Path of the zip file to extract.
        extract_dir: Directory the archive is extracted into.

    Raises:
        HTTPException: 400 when the file is not a valid zip, has more than
            MAX_ZIP_ENTRIES entries, exceeds MAX_UNCOMPRESSED_SIZE or
            MAX_COMPRESSION_RATIO, or contains an entry that is a symlink or
            would land outside extract_dir.
    """
    import stat  # local import: only needed for the symlink mode-bit check

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Limit the number of entries.
            file_list = zip_ref.infolist()
            if len(file_list) > MAX_ZIP_ENTRIES:
                raise zipfile.BadZipFile(f"zip 文件包含过多文件: {len(file_list)}")

            # ZipInfo.file_size is the uncompressed size of an entry and
            # ZipInfo.compress_size is its size inside the archive.
            uncompressed_size = sum(z.file_size for z in file_list)
            compressed_size = sum(z.compress_size for z in file_list)

            if uncompressed_size > MAX_UNCOMPRESSED_SIZE:
                raise zipfile.BadZipFile(
                    f"解压后大小 {uncompressed_size // (1024*1024)}MB 超过限制 "
                    f"{MAX_UNCOMPRESSED_SIZE // (1024*1024)}MB"
                )

            # Compression ratio check (zip bomb protection).
            if compressed_size > 0:
                ratio = uncompressed_size / compressed_size
                if ratio > MAX_COMPRESSION_RATIO:
                    raise zipfile.BadZipFile(
                        f"压缩比例 {ratio:.1f}:1 超过限制 {MAX_COMPRESSION_RATIO}:1，"
                        f"可能是 zip 炸弹攻击"
                    )

            # Canonical form of the target directory, used for containment checks.
            extract_dir_real = os.path.realpath(extract_dir)

            # Pass 1: validate every entry before writing anything.
            for zip_info in file_list:
                # Obvious traversal attempts in the stored name.
                if '..' in zip_info.filename or zip_info.filename.startswith('/'):
                    raise zipfile.BadZipFile(
                        f"检测到路径遍历攻击: {zip_info.filename}"
                    )

                # The resolved destination must be the extract directory
                # itself or something below it.
                target_path = os.path.realpath(os.path.join(extract_dir, zip_info.filename))
                inside_target = (
                    target_path == extract_dir_real
                    or target_path.startswith(extract_dir_real + os.sep)
                )
                if not inside_target:
                    raise zipfile.BadZipFile(
                        f"文件将被解压到目标目录之外: {zip_info.filename}"
                    )

                # Symlinks are detected through the Unix mode stored in the
                # upper 16 bits of external_attr.
                # NOTE(review): ZipInfo.is_symlink() is not part of the
                # documented ZipInfo API, so mode bits are inspected instead.
                if stat.S_ISLNK(zip_info.external_attr >> 16):
                    raise zipfile.BadZipFile(
                        f"不允许符号链接: {zip_info.filename}"
                    )

            # Pass 2: extract in a worker thread so the event loop is not blocked.
            for zip_info in file_list:
                await asyncio.to_thread(zip_ref.extract, zip_info, extract_dir)

    except zipfile.BadZipFile as e:
        raise HTTPException(status_code=400, detail=f"无效的 zip 文件: {str(e)}")


async def save_upload_file_async(file: UploadFile, destination: str) -> None:
    """Stream an uploaded file to disk without blocking the event loop.

    Args:
        file: The uploaded file, read from its current position.
        destination: Path of the file to write; opened in binary write mode.
    """
    block_size = 8192
    async with aiofiles.open(destination, 'wb') as out_file:
        while True:
            data = await file.read(block_size)
            if not data:
                # An empty read marks the end of the upload.
                break
            await out_file.write(data)


def parse_skill_frontmatter(skill_md_path: str) -> Optional[dict]:
    """Parse the YAML frontmatter of a SKILL.md file.

    Only flat ``key: value`` lines are understood. Blank lines and ``#``
    comment lines are skipped, and one pair of matching surrounding quotes
    is removed from a value.

    Args:
        skill_md_path: Path to the SKILL.md file.

    Returns:
        dict with 'name' and 'description' when both are present, otherwise
        None. None is also returned when the file has no frontmatter or
        cannot be read or parsed.
    """
    try:
        # utf-8-sig drops a leading BOM, which would otherwise prevent the
        # "^---" match below; files without a BOM decode exactly as utf-8.
        with open(skill_md_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Match YAML frontmatter between --- delimiters
        frontmatter_match = re.match(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
        if not frontmatter_match:
            logger.warning(f"No frontmatter found in {skill_md_path}")
            return None

        metadata = {}

        # Parse key: value pairs from frontmatter
        for raw_line in frontmatter_match.group(1).split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('#') or ':' not in line:
                continue

            # Split on the first colon only so values may contain colons.
            key, value = line.split(':', 1)
            value = value.strip()

            # YAML allows quoted scalars; the quotes are not part of the value.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]

            metadata[key.strip()] = value

        # Return name and description if both exist
        if 'name' in metadata and 'description' in metadata:
            return {
                'name': metadata['name'],
                'description': metadata['description']
            }

        logger.warning(f"Missing name or description in {skill_md_path}")
        return None

    except Exception as e:
        # Best effort: an unreadable or malformed file is treated as "no metadata".
        logger.error(f"Error parsing {skill_md_path}: {e}")
        return None


def get_official_skills(base_dir: str) -> List[SkillItem]:
    """Collect the official skills found under the configured skills directory.

    A skill is a sub-directory containing a SKILL.md whose frontmatter
    yields both a name and a description.

    Args:
        base_dir: Project base directory; used only when SKILLS_DIR is relative.

    Returns:
        List of SkillItem objects with user_skill set to False. Empty when
        the skills directory does not exist.
    """
    # An absolute SKILLS_DIR is used as-is; a relative one is anchored at base_dir.
    skills_root = SKILLS_DIR if os.path.isabs(SKILLS_DIR) else os.path.join(base_dir, SKILLS_DIR)

    found: List[SkillItem] = []

    if not os.path.exists(skills_root):
        logger.warning(f"Official skills directory not found: {skills_root}")
        return found

    for entry in os.listdir(skills_root):
        entry_path = os.path.join(skills_root, entry)
        if not os.path.isdir(entry_path):
            continue

        manifest_path = os.path.join(entry_path, 'SKILL.md')
        if not os.path.exists(manifest_path):
            continue

        info = parse_skill_frontmatter(manifest_path)
        if not info:
            continue

        found.append(SkillItem(
            name=info['name'],
            description=info['description'],
            user_skill=False
        ))
        logger.debug(f"Found official skill: {info['name']}")

    return found


def get_user_skills(base_dir: str, bot_id: str) -> List[SkillItem]:
    """Collect the skills uploaded for one bot.

    Looks in <base_dir>/projects/uploads/<bot_id>/skills for sub-directories
    containing a SKILL.md whose frontmatter yields a name and a description.

    Args:
        base_dir: Project base directory.
        bot_id: Bot whose uploaded skills are listed.

    Returns:
        List of SkillItem objects with user_skill set to True. Empty when
        the bot has no skills directory.
    """
    collected: List[SkillItem] = []
    skills_root = os.path.join(base_dir, 'projects', 'uploads', bot_id, 'skills')

    if not os.path.exists(skills_root):
        logger.info(f"No user skills directory found for bot {bot_id}: {skills_root}")
        return collected

    # Keep only entries that are directories holding a SKILL.md manifest.
    candidate_dirs = (
        os.path.join(skills_root, entry) for entry in os.listdir(skills_root)
    )
    for candidate in candidate_dirs:
        if not os.path.isdir(candidate):
            continue

        manifest_path = os.path.join(candidate, 'SKILL.md')
        if not os.path.exists(manifest_path):
            continue

        info = parse_skill_frontmatter(manifest_path)
        if info:
            collected.append(SkillItem(
                name=info['name'],
                description=info['description'],
                user_skill=True
            ))
            logger.debug(f"Found user skill: {info['name']}")

    return collected


@router.get("/api/v1/skill/list", response_model=SkillListResponse)
async def list_skills(
    bot_id: str = Query(..., description="Bot ID to fetch user skills for")
):
    """
    Get list of all available skills (official + user uploaded)

    Args:
        bot_id: Bot ID to fetch user uploaded skills for

    Returns:
        SkillListResponse containing all skills, user skills first

    Raises:
        HTTPException: 400 when bot_id fails validation, 500 when reading
            the skill directories fails.

    Notes:
        - Official skills are read from the /skills directory
        - User skills are read from /projects/uploads/{bot_id}/skills directory
        - User skills are marked with user_skill: true
    """
    # bot_id becomes a path component in get_user_skills, so it gets the same
    # validation as in the upload endpoint. This runs outside the try block so
    # the resulting 400 is not rewritten into a 500 by the handler below.
    bot_id = validate_bot_id(bot_id)

    try:
        # Get the project base directory
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        # Directory scans and file reads are blocking, so run them in worker
        # threads instead of on the event loop.
        official_skills = await asyncio.to_thread(get_official_skills, base_dir)
        user_skills = await asyncio.to_thread(get_user_skills, base_dir, bot_id)

        # Combine both lists (user skills first)
        all_skills = user_skills + official_skills

        logger.info(f"Found {len(official_skills)} official skills and {len(user_skills)} user skills for bot {bot_id}")

        return SkillListResponse(
            skills=all_skills,
            total=len(all_skills)
        )

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        logger.error(f"Error in list_skills: {str(e)}")
        logger.error(f"Full traceback: {error_details}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


async def _discard_failed_upload(file_path: Optional[str], new_extract_dir: Optional[str]) -> None:
    """Best-effort removal of what a failed upload left on disk.

    Args:
        file_path: Saved zip file to delete, or None when nothing was saved.
        new_extract_dir: Extract directory created by this request, or None
            when the directory already existed and must be left alone.
    """
    if file_path and os.path.exists(file_path):
        try:
            await asyncio.to_thread(os.remove, file_path)
            logger.info(f"Cleaned up file: {file_path}")
        except Exception as cleanup_error:
            logger.error(f"Failed to cleanup file: {cleanup_error}")

    if new_extract_dir and os.path.isdir(new_extract_dir):
        try:
            await asyncio.to_thread(shutil.rmtree, new_extract_dir)
            logger.info(f"Cleaned up directory: {new_extract_dir}")
        except Exception as cleanup_error:
            logger.error(f"Failed to cleanup directory: {cleanup_error}")


@router.post("/api/v1/skill/upload")
async def upload_skill(file: UploadFile = File(...), bot_id: Optional[str] = Form(None)):
    """
    Skill upload endpoint: stores a zip under ./projects/uploads/ and extracts it.

    Security measures:
        - P1-001: ZipSlip protection - every entry's extraction path is checked
        - P1-004: upload size limit - 50MB maximum
        - P1-005: zip bomb protection - compression ratio (max 100:1) and
          extracted size (max 500MB) are checked
        - P1-008: async I/O - aiofiles and asyncio.to_thread
        - The client-supplied filename is reduced to its final path component
          before it is used to build any filesystem path

    Args:
        file: The uploaded zip file
        bot_id: Bot ID, used to create the bot-specific skills directory

    Returns:
        dict: success flag, message, saved zip path, extract path, filename
        and skill name

    Raises:
        HTTPException: 400 for a missing or invalid bot_id or filename, a
            non-zip file or an unsafe archive; 413 for an oversized upload;
            500 for any other failure.

    Notes:
        - Only .zip skill files are accepted
        - The archive is extracted to projects/uploads/{bot_id}/skills/{skill_name}/
        - Upload size limit: 50MB
        - Extracted size limit: 500MB
    """
    file_path = None  # set once the zip path is known, for cleanup on failure
    new_extract_dir = None  # set only when this request created the extract dir

    try:
        # Validate bot_id (P1-006 path traversal protection)
        if not bot_id:
            raise HTTPException(status_code=400, detail="bot_id 不能为空")
        bot_id = validate_bot_id(bot_id)

        # Validate the filename
        if not file.filename:
            raise HTTPException(status_code=400, detail="文件名不能为空")

        logger.info(f"Skill upload - bot_id: {bot_id}, filename: {file.filename}")

        # The filename is client-controlled: keep only its last component
        # (treating "\\" as a separator too) so it cannot point outside the
        # per-bot directories.
        original_filename = os.path.basename(file.filename.replace('\\', '/'))
        name_without_ext, file_extension = os.path.splitext(original_filename)

        # Only zip files are accepted
        if file_extension.lower() != '.zip':
            raise HTTPException(status_code=400, detail="仅支持上传.zip格式的skill文件")

        # Reject names that are empty, consist only of dots/spaces, or carry
        # control characters; none of these is a usable directory name.
        if not name_without_ext.strip('. ') or any(
            ord(ch) < 32 or ord(ch) == 127 for ch in original_filename
        ):
            raise HTTPException(status_code=400, detail="文件名包含非法字符")

        # P1-004: validate the upload size (async read, does not block the loop)
        file_size = await validate_upload_file_size(file)
        logger.info(f"File size: {file_size // 1024}KB")

        folder_name = name_without_ext

        # NOTE(review): these paths are relative to the process working
        # directory, while list_skills anchors the uploads tree at this
        # package's parent directory - confirm both resolve to the same place.
        upload_dir = os.path.join("projects", "uploads", bot_id, "skill_zip")
        extract_target = os.path.join("projects", "uploads", bot_id, "skills", folder_name)

        # Remember whether the extract directory is new, so a failed upload
        # never deletes a skill that was already installed.
        extract_existed = await asyncio.to_thread(os.path.exists, extract_target)
        await asyncio.to_thread(os.makedirs, extract_target, exist_ok=True)
        if not extract_existed:
            new_extract_dir = extract_target
        await asyncio.to_thread(os.makedirs, upload_dir, exist_ok=True)

        # Path the zip file is stored at
        file_path = os.path.join(upload_dir, original_filename)

        # P1-008: save the file asynchronously (aiofiles)
        await save_upload_file_async(file, file_path)
        logger.info(f"Saved zip file: {file_path}")

        # P1-001, P1-005: safe extraction (ZipSlip and zip bomb protection)
        await safe_extract_zip(file_path, extract_target)
        logger.info(f"Extracted to: {extract_target}")

        return {
            "success": True,
            "message": "Skill文件上传并解压成功",
            "file_path": file_path,
            "extract_path": extract_target,
            "original_filename": original_filename,
            "skill_name": folder_name
        }

    except HTTPException:
        # Deliberate client-facing errors pass through after cleanup.
        await _discard_failed_upload(file_path, new_extract_dir)
        raise

    except Exception as e:
        await _discard_failed_upload(file_path, new_extract_dir)

        logger.error(f"Error uploading skill file: {str(e)}")
        # Do not expose internal error details to the client.
        raise HTTPException(status_code=500, detail="Skill文件上传失败")