catalog-agent/fastapi_app.py

import json
import os
import aiofiles
import aiohttp
import hashlib
from typing import AsyncGenerator, Dict, List, Optional, Union

import uvicorn
from fastapi import FastAPI, HTTPException, Depends, Header
from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from qwen_agent.llm.schema import ASSISTANT, FUNCTION


# 自定义版本，不需要text参数，不打印到终端
def get_content_from_messages(messages: List[dict]) -> str:
    full_text = ''
    content = []
    TOOL_CALL_S = '[TOOL_CALL]'
    TOOL_RESULT_S = '[TOOL_RESPONSE]'
    THOUGHT_S = '[THINK]'
    ANSWER_S = '[ANSWER]'

    for msg in messages:
        if msg['role'] == ASSISTANT:
            if msg.get('reasoning_content'):
                assert isinstance(msg['reasoning_content'], str), 'Now only supports text messages'
                content.append(f'{THOUGHT_S}\n{msg["reasoning_content"]}')
            if msg.get('content'):
                assert isinstance(msg['content'], str), 'Now only supports text messages'
                content.append(f'{ANSWER_S}\n{msg["content"]}')
            if msg.get('function_call'):
                content.append(f'{TOOL_CALL_S} {msg["function_call"]["name"]}\n{msg["function_call"]["arguments"]}')
        elif msg['role'] == FUNCTION:
            content.append(f'{TOOL_RESULT_S} {msg["name"]}\n{msg["content"]}')
        else:
            raise TypeError
    if content:
        full_text = '\n'.join(content)

    return full_text

from file_loaded_agent_manager import get_global_agent_manager, init_global_agent_manager
from gbase_agent import update_agent_llm


async def download_file(url: str, destination_path: str) -> bool:
    """Download file from URL to destination path"""
    try:
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status == 200:
                    async with aiofiles.open(destination_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                    return True
                else:
                    print(f"Failed to download file from {url}, status: {response.status}")
                    return False
    except Exception as e:
        print(f"Error downloading file from {url}: {str(e)}")
        return False


def get_file_hash(file_path: str) -> str:
    """Generate MD5 hash for a file path/URL"""
    return hashlib.md5(file_path.encode('utf-8')).hexdigest()

def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
    """Load processed files log for a project"""
    log_file = os.path.join("projects", unique_id, "processed_files.json")
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading processed files log: {str(e)}")
    return {}

def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
    """Save processed files log for a project"""
    log_file = os.path.join("projects", unique_id, "processed_files.json")
    try:
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(processed_log, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Error saving processed files log: {str(e)}")

def remove_file_or_directory(path: str):
    """Remove file or directory if it exists"""
    if os.path.exists(path):
        try:
            if os.path.isdir(path):
                import shutil
                shutil.rmtree(path)
                print(f"Removed directory: {path}")
            else:
                os.remove(path)
                print(f"Removed file: {path}")
            return True
        except Exception as e:
            print(f"Error removing {path}: {str(e)}")
    return False

def remove_dataset_directory(unique_id: str, filename_without_ext: str):
    """Remove the entire dataset directory for a specific file"""
    dataset_dir = os.path.join("projects", unique_id, "dataset", filename_without_ext)
    if remove_file_or_directory(dataset_dir):
        print(f"Removed dataset directory: {dataset_dir}")
        return True
    return False

def get_document_preview(document_path: str, max_lines: int = 10) -> str:
    """Get preview of document content (first max_lines lines)"""
    try:
        with open(document_path, 'r', encoding='utf-8') as f:
            lines = []
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                lines.append(line.rstrip())
            return '\n'.join(lines)
    except Exception as e:
        print(f"Error reading document preview from {document_path}: {str(e)}")
        return f"Error reading document: {str(e)}"

def generate_dataset_structure(unique_id: str) -> str:
    """Generate dataset directory structure as a string"""
    dataset_dir = os.path.join("projects", unique_id, "dataset")
    structure_lines = []

    def build_tree(path: str, prefix: str = "", is_last: bool = True):
        try:
            items = sorted(os.listdir(path))
            items = [item for item in items if not item.startswith('.')]  # Hide hidden files

            for i, item in enumerate(items):
                item_path = os.path.join(path, item)
                is_dir = os.path.isdir(item_path)

                # Determine tree symbols
                if i == len(items) - 1:
                    current_prefix = "└── " if is_last else "├── "
                    next_prefix = "    " if is_last else "│   "
                else:
                    current_prefix = "├── "
                    next_prefix = "│   "

                line = prefix + current_prefix + item
                if is_dir:
                    line += "/"
                structure_lines.append(line)

                # Recursively process subdirectories
                if is_dir:
                    build_tree(item_path, prefix + next_prefix, i == len(items) - 1)

        except Exception as e:
            print(f"Error building tree for {path}: {str(e)}")

    structure_lines.append("dataset/")
    if os.path.exists(dataset_dir):
        build_tree(dataset_dir)
    else:
        structure_lines.append("    (empty)")

    return '\n'.join(structure_lines)

def generate_project_readme(unique_id: str) -> str:
    """Generate README.md content for a project"""
    project_dir = os.path.join("projects", unique_id)
    dataset_dir = os.path.join(project_dir, "dataset")

    readme_content = f"""# Project: {unique_id}

## Dataset Structure

```
{generate_dataset_structure(unique_id)}
```

## Files Description

"""

    if not os.path.exists(dataset_dir):
        readme_content += "No dataset files available.\n"
    else:
        # Get all document directories
        doc_dirs = []
        try:
            for item in sorted(os.listdir(dataset_dir)):
                item_path = os.path.join(dataset_dir, item)
                if os.path.isdir(item_path):
                    doc_dirs.append(item)
        except Exception as e:
            print(f"Error listing dataset directories: {str(e)}")

        if not doc_dirs:
            readme_content += "No document directories found.\n"
        else:
            for doc_dir in doc_dirs:
                doc_path = os.path.join(dataset_dir, doc_dir)
                document_file = os.path.join(doc_path, "document.txt")
                pagination_file = os.path.join(doc_path, "pagination.txt")
                embeddings_file = os.path.join(doc_path, "document_embeddings.pkl")

                readme_content += f"### {doc_dir}\n\n"
                readme_content += f"**Files:**\n"
                readme_content += f"- `document.txt`"
                if os.path.exists(document_file):
                    readme_content += " ✓"
                readme_content += "\n"

                readme_content += f"- `pagination.txt`"
                if os.path.exists(pagination_file):
                    readme_content += " ✓"
                readme_content += "\n"

                readme_content += f"- `document_embeddings.pkl`"
                if os.path.exists(embeddings_file):
                    readme_content += " ✓"
                readme_content += "\n\n"

                # Add document preview
                if os.path.exists(document_file):
                    readme_content += f"**Content Preview (first 10 lines):**\n\n```\n"
                    preview = get_document_preview(document_file, 10)
                    readme_content += preview
                    readme_content += "\n```\n\n"
                else:
                    readme_content += f"**Content Preview:** Not available\n\n"

    readme_content += f"""---
*Generated on {__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

    return readme_content

def save_project_readme(unique_id: str):
    """Generate and save README.md for a project"""
    try:
        readme_content = generate_project_readme(unique_id)
        readme_path = os.path.join("projects", unique_id, "README.md")

        with open(readme_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)

        print(f"Generated README.md for project {unique_id}")
        return readme_path
    except Exception as e:
        print(f"Error generating README for project {unique_id}: {str(e)}")
        return None

async def download_dataset_files(unique_id: str, files: List[str]) -> List[str]:
    """Download or copy dataset files to projects/{unique_id}/files directory with processing state management"""
    if not files:
        return []

    # Load existing processed files log
    processed_log = load_processed_files_log(unique_id)
    files_dir = os.path.join("projects", unique_id, "files")

    # Convert files list to a set for easy comparison
    new_files_hashes = {get_file_hash(file_path): file_path for file_path in files}
    existing_files_hashes = set(processed_log.keys())

    # Files to process (new or modified)
    files_to_process = []
    # Files to remove (no longer in the list)
    files_to_remove = existing_files_hashes - set(new_files_hashes.keys())

    processed_files = []

    # Remove files that are no longer in the list
    for file_hash in files_to_remove:
        file_info = processed_log[file_hash]

        # Remove local file in files directory
        if 'local_path' in file_info:
            remove_file_or_directory(file_info['local_path'])

        # Remove the entire dataset directory for this file
        if 'filename' in file_info:
            filename_without_ext = os.path.splitext(file_info['filename'])[0]
            remove_dataset_directory(unique_id, filename_without_ext)

        # Also remove any specific dataset path if exists (fallback)
        if 'dataset_path' in file_info:
            remove_file_or_directory(file_info['dataset_path'])

        # Remove from log
        del processed_log[file_hash]
        print(f"Removed file from processing: {file_info.get('original_path', 'unknown')}")

    # Process new files
    for file_path in files:
        file_hash = get_file_hash(file_path)

        # Check if file was already processed
        if file_hash in processed_log:
            file_info = processed_log[file_hash]
            if 'local_path' in file_info and os.path.exists(file_info['local_path']):
                processed_files.append(file_info['local_path'])
                print(f"Skipped already processed file: {file_path}")
                continue

        # Extract filename from URL or path
        filename = file_path.split("/")[-1]
        if not filename:
            filename = f"file_{len(processed_files)}"

        destination_path = os.path.join(files_dir, filename)

        # Check if it's a URL (remote file) or local file
        success = False
        if file_path.startswith(('http://', 'https://')):
            # Download remote file
            success = await download_file(file_path, destination_path)
        else:
            # Copy local file
            try:
                import shutil
                os.makedirs(files_dir, exist_ok=True)
                shutil.copy2(file_path, destination_path)
                success = True
                print(f"Copied local file: {file_path} -> {destination_path}")
            except Exception as e:
                print(f"Failed to copy local file {file_path}: {str(e)}")

        if success:
            processed_files.append(destination_path)
            # Update processed log
            processed_log[file_hash] = {
                'original_path': file_path,
                'local_path': destination_path,
                'filename': filename,
                'processed_at': str(__import__('datetime').datetime.now()),
                'file_type': 'remote' if file_path.startswith(('http://', 'https://')) else 'local'
            }
            print(f"Successfully processed file: {file_path}")
        else:
            print(f"Failed to process file: {file_path}")

    # After downloading/copying files, organize them into dataset structure
    if processed_files:
        try:
            from organize_dataset_files import organize_single_project_files

            # Update dataset paths in the log after organization
            old_processed_log = processed_log.copy()
            organize_single_project_files(unique_id, skip_processed=True)

            # Try to update dataset paths in the log
            for file_hash, file_info in old_processed_log.items():
                if 'local_path' in file_info and os.path.exists(file_info['local_path']):
                    # Construct expected dataset path based on known structure
                    filename_without_ext = os.path.splitext(file_info['filename'])[0]
                    dataset_path = os.path.join("projects", unique_id, "dataset", filename_without_ext, "document.txt")
                    if os.path.exists(dataset_path):
                        processed_log[file_hash]['dataset_path'] = dataset_path

            print(f"Organized files for project {unique_id} into dataset structure (skipping already processed files)")
        except Exception as e:
            print(f"Failed to organize files for project {unique_id}: {str(e)}")

    # Save the updated processed log
    save_processed_files_log(unique_id, processed_log)

    # Generate README.md after processing files
    try:
        save_project_readme(unique_id)
    except Exception as e:
        print(f"Failed to generate README for project {unique_id}: {str(e)}")

    return processed_files


# 全局助手管理器配置
max_cached_agents = int(os.getenv("MAX_CACHED_AGENTS", "20"))

# 初始化全局助手管理器
agent_manager = init_global_agent_manager(max_cached_agents=max_cached_agents)

app = FastAPI(title="Database Assistant API", version="1.0.0")

# 挂载public文件夹为静态文件服务
app.mount("/public", StaticFiles(directory="public"), name="static")

# 添加CORS中间件，支持前端页面
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # 在生产环境中应该设置为具体的前端域名
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class Message(BaseModel):
    role: str
    content: str


class DatasetRequest(BaseModel):
    system_prompt: Optional[str] = None
    mcp_settings: Optional[List[Dict]] = None
    files: Optional[List[str]] = None
    unique_id: Optional[str] = None


class ChatRequest(BaseModel):
    messages: List[Message]
    model: str = "qwen3-next"
    model_server: str = ""
    unique_id: Optional[str] = None
    stream: Optional[bool] = False

    class Config:
        extra = 'allow'


class ChatResponse(BaseModel):
    choices: List[Dict]
    usage: Optional[Dict] = None


class ChatStreamResponse(BaseModel):
    choices: List[Dict]
    usage: Optional[Dict] = None


async def generate_stream_response(agent, messages, request) -> AsyncGenerator[str, None]:
    """生成流式响应"""
    accumulated_content = ""
    chunk_id = 0
    try:
        for response in agent.run(messages=messages):
            previous_content = accumulated_content
            accumulated_content = get_content_from_messages(response)

            # 计算新增的内容
            if accumulated_content.startswith(previous_content):
                new_content = accumulated_content[len(previous_content):]
            else:
                new_content = accumulated_content
                previous_content = ""

            # 只有当有新内容时才发送chunk
            if new_content:
                chunk_id += 1
                # 构造OpenAI格式的流式响应
                chunk_data = {
                    "id": f"chatcmpl-{chunk_id}",
                    "object": "chat.completion.chunk",
                    "created": int(__import__('time').time()),
                    "model": request.model,
                    "choices": [{
                        "index": 0,
                        "delta": {
                            "content": new_content
                        },
                        "finish_reason": None
                    }]
                }

                yield f"data: {json.dumps(chunk_data, ensure_ascii=False)}\n\n"

        # 发送最终完成标记
        final_chunk = {
            "id": f"chatcmpl-{chunk_id + 1}",
            "object": "chat.completion.chunk",
            "created": int(__import__('time').time()),
            "model": request.model,
            "choices": [{
                "index": 0,
                "delta": {},
                "finish_reason": "stop"
            }]
        }
        yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"

        # 发送结束标记
        yield "data: [DONE]\n\n"

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in generate_stream_response: {str(e)}")
        print(f"Full traceback: {error_details}")

        error_data = {
            "error": {
                "message": f"Stream error: {str(e)}",
                "type": "internal_error"
            }
        }
        yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"


class FileProcessRequest(BaseModel):
    unique_id: str
    files: Optional[List[str]] = None
    system_prompt: Optional[str] = None
    mcp_settings: Optional[List[Dict]] = None

    class Config:
        extra = 'allow'


class FileProcessResponse(BaseModel):
    success: bool
    message: str
    unique_id: str
    processed_files: List[str]


@app.post("/api/v1/files/process")
async def process_files(request: FileProcessRequest, authorization: Optional[str] = Header(None)):
    """
    Process dataset files for a given unique_id

    Args:
        request: FileProcessRequest containing unique_id, files, system_prompt, and mcp_settings
        authorization: Authorization header containing API key (Bearer <API_KEY>)

    Returns:
        FileProcessResponse: Processing result with file list
    """
    try:

        unique_id = request.unique_id
        if not unique_id:
            raise HTTPException(status_code=400, detail="unique_id is required")

        # 处理文件：只使用request.files
        processed_files = []
        if request.files:
            # 使用请求中的文件
            processed_files = await download_dataset_files(unique_id, request.files)
            print(f"Processed {len(processed_files)} dataset files for unique_id: {unique_id}")
        else:
            print(f"No files provided in request for unique_id: {unique_id}")

        # 使用unique_id获取项目目录
        project_dir = os.path.join("projects", unique_id)
        if not os.path.exists(project_dir):
            raise HTTPException(status_code=400, detail=f"Project directory not found for unique_id: {unique_id}")

        # 收集项目目录下所有的 document.txt 文件
        document_files = []
        for root, dirs, files in os.walk(project_dir):
            for file in files:
                if file == "document.txt":
                    document_files.append(os.path.join(root, file))

        # 合并所有处理的文件
        all_files = document_files + processed_files

        if not all_files:
            print(f"警告: 项目目录 {project_dir} 中未找到任何 document.txt 文件")

        # 保存system_prompt和mcp_settings到项目目录（如果提供）
        if request.system_prompt:
            system_prompt_file = os.path.join(project_dir, "system_prompt.md")
            with open(system_prompt_file, 'w', encoding='utf-8') as f:
                f.write(request.system_prompt)
            print(f"Saved system_prompt for unique_id: {unique_id}")

        if request.mcp_settings:
            mcp_settings_file = os.path.join(project_dir, "mcp_settings.json")
            with open(mcp_settings_file, 'w', encoding='utf-8') as f:
                json.dump(request.mcp_settings, f, ensure_ascii=False, indent=2)
            print(f"Saved mcp_settings for unique_id: {unique_id}")

        return FileProcessResponse(
            success=True,
            message=f"Successfully processed {len(all_files)} files",
            unique_id=unique_id,
            processed_files=all_files
        )

    except HTTPException:
        raise
    except Exception as e:
        print(f"Error processing files: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


@app.post("/api/v1/chat/completions")
async def chat_completions(request: ChatRequest, authorization: Optional[str] = Header(None)):
    """
    Chat completions API similar to OpenAI, supports both streaming and non-streaming

    Args:
        request: ChatRequest containing messages, model, dataset with unique_id, system_prompt, mcp_settings, and files
        authorization: Authorization header containing API key (Bearer <API_KEY>)

    Returns:
        Union[ChatResponse, StreamingResponse]: Chat completion response or stream
    """
    try:
        # 从Authorization header中提取API key
        api_key = None
        if authorization:
            # 移除 "Bearer " 前缀
            if authorization.startswith("Bearer "):
                api_key = authorization[7:]
            else:
                api_key = authorization

        # 获取unique_id
        unique_id = request.unique_id
        if not unique_id:
            raise HTTPException(status_code=400, detail="unique_id is required")

        # 使用unique_id获取项目目录
        project_dir = os.path.join("projects", unique_id)
        if not os.path.exists(project_dir):
            raise HTTPException(status_code=400, detail=f"Project directory not found for unique_id: {unique_id}")

        # 收集额外参数作为 generate_cfg
        exclude_fields = {'messages', 'model', 'model_server', 'unique_id', 'stream'}
        generate_cfg = {k: v for k, v in request.model_dump().items() if k not in exclude_fields}

        # 从全局管理器获取或创建助手实例（配置读取逻辑已在agent_manager内部处理）
        agent = await agent_manager.get_or_create_agent(
            unique_id=unique_id,
            project_dir=project_dir,
            model_name=request.model,
            api_key=api_key,
            model_server=request.model_server,
            generate_cfg=generate_cfg
        )
        # 构建包含项目信息的消息上下文
        messages = []
        for msg in request.messages:
            if msg.role == "assistant":
                # 对assistant消息进行[ANSWER]分割处理，只保留最后一段
                content_parts = msg.content.split("[ANSWER]")
                if content_parts:
                    # 取最后一段非空文本
                    last_part = content_parts[-1].strip()
                    messages.append({"role": msg.role, "content": last_part})
                else:
                    messages.append({"role": msg.role, "content": msg.content})
            else:
                messages.append({"role": msg.role, "content": msg.content})

        # 根据stream参数决定返回流式还是非流式响应
        if request.stream:
            return StreamingResponse(
                generate_stream_response(agent, messages, request),
                media_type="text/event-stream",
                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
            )
        else:
            # 非流式响应
            final_responses = agent.run_nonstream(messages)

            if final_responses and len(final_responses) > 0:
                # 取最后一个响应
                final_response = final_responses[-1]

                # 如果返回的是Message对象，需要转换为字典
                if hasattr(final_response, 'model_dump'):
                    final_response = final_response.model_dump()
                elif hasattr(final_response, 'dict'):
                    final_response = final_response.dict()

                content = final_response.get("content", "")

                # 构造OpenAI格式的响应
                return ChatResponse(
                    choices=[{
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": content
                        },
                        "finish_reason": "stop"
                    }],
                    usage={
                        "prompt_tokens": sum(len(msg.content) for msg in request.messages),
                        "completion_tokens": len(content),
                        "total_tokens": sum(len(msg.content) for msg in request.messages) + len(content)
                    }
                )
            else:
                raise HTTPException(status_code=500, detail="No response from agent")

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in chat_completions: {str(e)}")
        print(f"Full traceback: {error_details}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


@app.get("/api/health")
async def health_check():
    """Health check endpoint"""
    return {"message": "Database Assistant API is running"}


@app.get("/system/status")
async def system_status():
    """获取系统状态信息"""
    # 获取助手缓存统计
    cache_stats = agent_manager.get_cache_stats()

    return {
        "status": "running",
        "storage_type": "File-Loaded Agent Manager",
        "max_cached_agents": max_cached_agents,
        "agent_cache": {
            "total_cached_agents": cache_stats["total_cached_agents"],
            "max_cached_agents": cache_stats["max_cached_agents"],
            "cached_agents": cache_stats["agents"]
        }
    }


@app.post("/system/cleanup-cache")
async def cleanup_cache():
    """清理助手缓存"""
    try:
        # 清理助手实例缓存
        cleared_count = agent_manager.clear_cache()

        return {
            "message": "缓存清理成功",
            "cleared_agent_instances": cleared_count
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"缓存清理失败: {str(e)}")


@app.post("/system/cleanup-agent-cache")
async def cleanup_agent_cache():
    """仅清理助手实例缓存"""
    try:
        cleared_count = agent_manager.clear_cache()
        return {
            "message": "助手实例缓存清理成功",
            "cleared_agent_instances": cleared_count
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"助手实例缓存清理失败: {str(e)}")


@app.get("/system/cached-projects")
async def get_cached_projects():
    """获取所有缓存的项目信息"""
    try:
        cache_stats = agent_manager.get_cache_stats()

        return {
            "cache_stats": cache_stats
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取缓存项目信息失败: {str(e)}")


@app.post("/system/remove-project-cache")
async def remove_project_cache(unique_id: str):
    """移除特定项目的缓存"""
    try:
        success = agent_manager.remove_cache_by_unique_id(unique_id)
        if success:
            return {"message": f"项目缓存移除成功: {unique_id}"}
        else:
            return {"message": f"未找到项目缓存: {unique_id}", "removed": False}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"移除项目缓存失败: {str(e)}")


@app.get("/api/v1/files/{unique_id}/status")
async def get_files_processing_status(unique_id: str):
    """获取项目的文件处理状态"""
    try:
        # Load processed files log
        processed_log = load_processed_files_log(unique_id)

        # Get project directory info
        project_dir = os.path.join("projects", unique_id)
        project_exists = os.path.exists(project_dir)

        # Collect document.txt files
        document_files = []
        if project_exists:
            for root, dirs, files in os.walk(project_dir):
                for file in files:
                    if file == "document.txt":
                        document_files.append(os.path.join(root, file))

        return {
            "unique_id": unique_id,
            "project_exists": project_exists,
            "processed_files_count": len(processed_log),
            "processed_files": processed_log,
            "document_files_count": len(document_files),
            "document_files": document_files,
            "log_file_exists": os.path.exists(os.path.join("projects", unique_id, "processed_files.json"))
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取文件处理状态失败: {str(e)}")


@app.post("/api/v1/files/{unique_id}/reset")
async def reset_files_processing(unique_id: str):
    """重置项目的文件处理状态，删除处理日志和所有文件"""
    try:
        project_dir = os.path.join("projects", unique_id)
        log_file = os.path.join("projects", unique_id, "processed_files.json")

        # Load processed log to know what files to remove
        processed_log = load_processed_files_log(unique_id)

        removed_files = []
        # Remove all processed files and their dataset directories
        for file_hash, file_info in processed_log.items():
            # Remove local file in files directory
            if 'local_path' in file_info:
                if remove_file_or_directory(file_info['local_path']):
                    removed_files.append(file_info['local_path'])

            # Remove the entire dataset directory for this file
            if 'filename' in file_info:
                filename_without_ext = os.path.splitext(file_info['filename'])[0]
                dataset_dir = os.path.join("projects", unique_id, "dataset", filename_without_ext)
                if remove_file_or_directory(dataset_dir):
                    removed_files.append(dataset_dir)

            # Also remove any specific dataset path if exists (fallback)
            if 'dataset_path' in file_info:
                if remove_file_or_directory(file_info['dataset_path']):
                    removed_files.append(file_info['dataset_path'])

        # Remove the log file
        if remove_file_or_directory(log_file):
            removed_files.append(log_file)

        # Remove the entire files directory
        files_dir = os.path.join(project_dir, "files")
        if remove_file_or_directory(files_dir):
            removed_files.append(files_dir)

        # Also remove the entire dataset directory (clean up any remaining files)
        dataset_dir = os.path.join(project_dir, "dataset")
        if remove_file_or_directory(dataset_dir):
            removed_files.append(dataset_dir)

        # Remove README.md if exists
        readme_file = os.path.join(project_dir, "README.md")
        if remove_file_or_directory(readme_file):
            removed_files.append(readme_file)

        return {
            "message": f"文件处理状态重置成功: {unique_id}",
            "removed_files_count": len(removed_files),
            "removed_files": removed_files
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"重置文件处理状态失败: {str(e)}")


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8001)