add symlink
parent 61c6b69aa5
commit d3465d418e
@@ -98,9 +98,7 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
     datetime_str = format_datetime_by_language(language) if language else format_datetime_by_language('en')

     # If the {language} placeholder exists, use system_prompt directly
-    if robot_type == "deep_agent":
-        return None
-    elif robot_type == "general_agent" or robot_type == "catalog_agent":
+    if robot_type == "general_agent" or robot_type == "catalog_agent" or robot_type == "deep_agent":
         """
         Prefer README.md from the project directory; only fall back to the default system_prompt_{robot_type}.md if it is missing
         """
@@ -122,7 +120,15 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
         readme_path = os.path.join(project_dir, "README.md")
         readme = await config_cache.get_text_file(readme_path) or ""

-        prompt = system_prompt_default.format(readme=str(readme), extra_prompt=system_prompt or "",language=language_display, user_identifier=user_identifier, datetime=datetime_str)
+        agent_dir_path = f"~/.deepagents/{bot_id}"  # agent_dir_path maps to the project_dir directory; it is just the directory path shown to the AI
+        prompt = system_prompt_default.format(
+            readme=str(readme),
+            extra_prompt=system_prompt or "",
+            language=language_display,
+            user_identifier=user_identifier,
+            datetime=datetime_str,
+            agent_dir_path=agent_dir_path
+        )
     elif system_prompt:
         prompt = system_prompt.format(language=language_display, user_identifier=user_identifier, datetime=datetime_str)
     return prompt or ""
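For orientation, a minimal sketch of how the restructured format() call above fills the new {agent_dir_path} placeholder. The template and values below are made up for illustration; only the placeholder names mirror prompt/system_prompt_deep_agent.md and the keyword arguments in the diff.

```python
# Toy template with the same placeholder names as the new prompt file.
template = (
    "<env>\nWorking directory: {agent_dir_path}\n</env>\n"
    "{readme}\n{extra_prompt}\n"
    "Language: {language} | User: {user_identifier} | Time: {datetime}\n"
)

prompt = template.format(
    readme="# Robot Project\n(dataset summary goes here)",  # placeholder content
    extra_prompt="",
    language="English",
    user_identifier="user-42",               # hypothetical user id
    datetime="2024-01-01 00:00:00",          # hypothetical timestamp
    agent_dir_path="~/.deepagents/bot-123",  # hypothetical bot_id, as in the f-string above
)
print(prompt)
```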
prompt/system_prompt_deep_agent.md (new file, 62 lines)
@@ -0,0 +1,62 @@
+<env>
+Working directory: {agent_dir_path}
+</env>
+
+### Current Working Directory
+
+The filesystem backend is currently operating in: `{agent_dir_path}`
+
+### File System and Paths
+
+**IMPORTANT - Path Handling:**
+- All file paths must be absolute paths (e.g., `{agent_dir_path}/file.txt`)
+- Use the working directory from <env> to construct absolute paths
+- Example: To create a file in your working directory, use `{agent_dir_path}/research_project/file.md`
+- Never use relative paths - always construct full absolute paths
+
+### Skills Directory
+
+Your skills are stored at: `{agent_dir_path}/skills/`
+Skills may contain scripts or supporting files. When executing skill scripts with bash, use the real filesystem path:
+Example: `bash python {agent_dir_path}/skills/web-research/script.py`
+
+### Human-in-the-Loop Tool Approval
+
+Some tool calls require user approval before execution. When a tool call is rejected by the user:
+1. Accept their decision immediately - do NOT retry the same command
+2. Explain that you understand they rejected the action
+3. Suggest an alternative approach or ask for clarification
+4. Never attempt the exact same rejected command again
+
+Respect the user's decisions and work with them collaboratively.
+
+### Web Search Tool Usage
+
+When you use the web_search tool:
+1. The tool will return search results with titles, URLs, and content excerpts
+2. You MUST read and process these results, then respond naturally to the user
+3. NEVER show raw JSON or tool results directly to the user
+4. Synthesize the information from multiple sources into a coherent answer
+5. Cite your sources by mentioning page titles or URLs when relevant
+6. If the search doesn't find what you need, explain what you found and ask clarifying questions
+
+The user only sees your text responses - not tool results. Always provide a complete, natural language answer after using web_search.
+
+### Todo List Management
+
+When using the write_todos tool:
+1. Keep the todo list MINIMAL - aim for 3-6 items maximum
+2. Only create todos for complex, multi-step tasks that truly need tracking
+3. Break down work into clear, actionable items without over-fragmenting
+4. For simple tasks (1-2 steps), just do them directly without creating todos
+5. When first creating a todo list for a task, ALWAYS ask the user if the plan looks good before starting work
+   - Create the todos, let them render, then ask: "Does this plan look good?" or similar
+   - Wait for the user's response before marking the first todo as in_progress
+   - If they want changes, adjust the plan accordingly
+6. Update todo status promptly as you complete each item
+
+The todo list is a planning tool - use it judiciously to avoid overwhelming the user with excessive task tracking.
+
+## System Information
+- **Current User**: {user_identifier}
+- **Current Time**: {datetime}
@@ -19,7 +19,6 @@ from .config import huey
 from utils.file_utils import (
     extract_zip_file,
     get_file_hash,
-    is_file_already_processed,
     load_processed_files_log,
     save_processed_files_log,
     get_document_preview
@@ -9,7 +9,6 @@ from .file_utils import (
     remove_file_or_directory,
     extract_zip_file,
     get_document_preview,
-    is_file_already_processed,
     load_processed_files_log,
     save_processed_files_log
 )
@@ -44,11 +43,6 @@ from .agent_pool import (
     release_agent_to_pool
 )

-from .organize_dataset_files import (
-    is_file_already_processed,
-    organize_single_project_files,
-    organize_dataset_files
-)

 from .api_models import (
     Message,
@@ -77,8 +71,6 @@ from .api_models import (

 from .multi_project_manager import (
     create_robot_project,
-    get_robot_project_info,
-    cleanup_robot_project,
     get_unique_folder_name,
     copy_dataset_folder,
     generate_robot_readme
@@ -96,7 +88,6 @@ __all__ = [
     'remove_file_or_directory',
     'extract_zip_file',
     'get_document_preview',
-    'is_file_already_processed',
     'load_processed_files_log',
     'save_processed_files_log',

@@ -122,10 +113,6 @@ __all__ = [
     'get_agent_from_pool',
     'release_agent_to_pool',

-    # organize_dataset_files
-    'is_file_already_processed',
-    'organize_single_project_files',
-    'organize_dataset_files',

     # api_models
     'Message',
@@ -152,8 +139,6 @@ __all__ = [

     # multi_project_manager
     'create_robot_project',
-    'get_robot_project_info',
-    'cleanup_robot_project',
     'get_unique_folder_name',
     'copy_dataset_folder',
     'generate_robot_readme',
@@ -373,7 +373,8 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo

     try:
         from utils.multi_project_manager import create_robot_project
-        return create_robot_project(dataset_ids, bot_id)
+        from pathlib import Path
+        return create_robot_project(dataset_ids, bot_id, project_path=Path("~", ".deepagents"))
     except Exception as e:
         logger.error(f"Error creating project directory: {e}")
         return None
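A brief aside on the pathlib call used above, since it is easy to misread: Path("~", ".deepagents") builds the literal relative path ~/.deepagents, and pathlib only resolves ~ to the home directory when expanduser() is called. A standalone sketch, not taken from this codebase:

```python
from pathlib import Path

p = Path("~", ".deepagents")
print(p)               # ~/.deepagents  (literal tilde, still a relative path)
print(p.expanduser())  # e.g. /home/alice/.deepagents on Linux
```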
@@ -91,18 +91,6 @@ def get_document_preview(document_path: str, max_lines: int = 10) -> str:
         return f"Error reading document: {str(e)}"


-def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-    if not target_file.exists():
-        return False
-
-    # Check if pagination and embeddings files exist and are not empty
-    if pagination_file.exists() and embeddings_file.exists():
-        # Check file sizes to ensure they're not empty
-        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-            return True
-
-    return False


 def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
@@ -141,7 +141,7 @@ def get_unique_folder_name(target_dir: Path, original_name: str) -> str:
         counter += 1


-def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict:
+def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str, project_path: Path) -> Dict:
     """
     Copy a single project's dataset folder to the target directory

@@ -149,6 +149,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
         source_project_id: Source project ID
         target_dataset_dir: Target dataset directory
         folder_name: Name of the folder to copy
+        project_path: Project path

     Returns:
         Dict: Copy result
@@ -163,7 +164,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
     }

     try:
-        source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name
+        source_folder = project_path / "data" / source_project_id / "dataset" / folder_name
         result["source_path"] = str(source_folder)

         if not source_folder.exists():
@@ -190,7 +191,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
         return result


-def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str:
+def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict], project_path: Path) -> str:
     """
     Generate the robot project's README.md file

@@ -202,10 +203,10 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
     Returns:
         str: Path to the README.md file
     """
-    readme_path = Path("projects") / "robot" / robot_id / "README.md"
+    readme_path = project_path / "robot" / robot_id / "README.md"
     readme_path.parent.mkdir(parents=True, exist_ok=True)

-    robot_dir = Path("projects") / "robot" / robot_id
+    robot_dir = project_path / "robot" / robot_id

     # Statistics
     total_folders = len(copy_results)
@@ -300,7 +301,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
     return str(readme_path)


-def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
+def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str, project_path: Path) -> bool:
     """
     Check whether the robot project needs to be rebuilt
     1. Check whether the robot project exists
@@ -310,11 +311,12 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
     Args:
         dataset_ids: List of source project IDs
         bot_id: Robot ID
+        project_path: Project path

     Returns:
         bool: Whether a rebuild is needed
     """
-    robot_dir = Path("projects") / "robot" / bot_id
+    robot_dir = project_path / "robot" / bot_id

     # If the robot project does not exist, it needs to be created
     if not robot_dir.exists():
@@ -356,7 +358,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:

     # Check each source project's processing_log.json file
     for source_project_id in dataset_ids:
-        log_file = Path("projects") / "data" / source_project_id / "processing_log.json"
+        log_file = project_path / "data" / source_project_id / "processing_log.json"

         if not log_file.exists():
             logger.info(f"Processing log file not found for project {source_project_id}, will rebuild")
@@ -373,7 +375,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
     return False


-def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str:
+def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False, project_path: Path = Path("projects")) -> str:
     """
     Create a robot project by merging dataset folders from multiple source projects

@@ -386,15 +388,15 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
         str: Path to the robot project directory
     """
     logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}")

     # Check whether a rebuild is needed
-    if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id):
-        robot_dir = Path("projects") / "robot" / bot_id
+    if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id, project_path):
+        robot_dir = project_path / "robot" / bot_id
         logger.info(f"Using existing robot project: {robot_dir}")
         return str(robot_dir)

     # Create the robot directory structure
-    robot_dir = Path("projects") / "robot" / bot_id
+    robot_dir = project_path / "robot" / bot_id
     dataset_dir = robot_dir / "dataset"

     # Clean up the existing directory (if needed)
@@ -411,7 +413,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
     for source_project_id in dataset_ids:
         logger.info(f"\nProcessing source project: {source_project_id}")

-        source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset"
+        source_dataset_dir = project_path / "data" / source_project_id / "dataset"

         if not source_dataset_dir.exists():
             logger.warning(f" Warning: Dataset directory not found for project {source_project_id}")
@@ -426,7 +428,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo

         # Copy each folder
         for folder in folders:
-            result = copy_dataset_folder(source_project_id, dataset_dir, folder.name)
+            result = copy_dataset_folder(source_project_id, dataset_dir, folder.name, project_path)
             copy_results.append(result)

     # Save configuration info
@@ -442,7 +444,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
         json.dump(config_data, f, ensure_ascii=False, indent=2)

     # Generate the README
-    readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results)
+    readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results, project_path)

     # Statistics
     successful_copies = sum(1 for r in copy_results if r["success"])
@@ -456,78 +458,6 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
     return str(robot_dir)


-def get_robot_project_info(bot_id: str) -> Dict:
-    """
-    Get robot project info
-
-    Args:
-        bot_id: Robot ID
-
-    Returns:
-        Dict: Robot project info
-    """
-    robot_dir = Path("projects") / "robot" / bot_id
-
-    if not robot_dir.exists():
-        return {
-            "exists": False,
-            "bot_id": bot_id,
-            "error": "Robot project does not exist"
-        }
-
-    dataset_dir = robot_dir / "dataset"
-    readme_path = robot_dir / "README.md"
-
-    # Count the folders
-    folder_count = 0
-    total_size = 0
-    if dataset_dir.exists():
-        for item in dataset_dir.iterdir():
-            if item.is_dir():
-                folder_count += 1
-                # Calculate folder size
-                for file_path in item.rglob('*'):
-                    if file_path.is_file():
-                        total_size += file_path.stat().st_size
-
-    return {
-        "exists": True,
-        "bot_id": bot_id,
-        "robot_dir": str(robot_dir),
-        "dataset_dir": str(dataset_dir),
-        "readme_exists": readme_path.exists(),
-        "folder_count": folder_count,
-        "total_size_bytes": total_size,
-        "total_size_mb": round(total_size / (1024 * 1024), 2)
-    }
-
-
-def cleanup_robot_project(bot_id: str) -> bool:
-    """
-    Clean up the robot project
-
-    Args:
-        bot_id: Robot ID
-
-    Returns:
-        bool: Whether cleanup succeeded
-    """
-    try:
-        robot_dir = Path("projects") / "robot" / bot_id
-
-        if robot_dir.exists():
-            shutil.rmtree(robot_dir)
-            logger.info(f"Cleaned up robot project: {bot_id}")
-            return True
-        else:
-            logger.info(f"Robot project does not exist: {bot_id}")
-            return True
-
-    except Exception as e:
-        logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}")
-        return False
-
-
 if __name__ == "__main__":
     # Test code
     test_dataset_ids = ["test-project-1", "test-project-2"]
@@ -536,5 +466,3 @@ if __name__ == "__main__":
     robot_dir = create_robot_project(test_dataset_ids, test_bot_id)
     logger.info(f"Created robot project at: {robot_dir}")

-    info = get_robot_project_info(test_bot_id)
-    logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
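A minimal usage sketch of the new project_path parameter threaded through these helpers, relying only on the signatures and imports shown in the hunks above; the dataset IDs and bot ID are hypothetical:

```python
from pathlib import Path

from utils.multi_project_manager import create_robot_project

# Hypothetical identifiers for illustration only.
dataset_ids = ["dataset-aaa", "dataset-bbb"]
bot_id = "bot-123"

# Previous behavior: everything rooted under ./projects (the default project_path).
robot_dir = create_robot_project(dataset_ids, bot_id)

# New behavior: root the robot project under another base directory.
robot_dir = create_robot_project(
    dataset_ids,
    bot_id,
    force_rebuild=False,
    project_path=Path("projects"),  # swap in any base Path here
)
print(robot_dir)
```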
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-import os
-import shutil
-import logging
-from pathlib import Path
-
-# Configure logging
-logger = logging.getLogger('app')
-
-def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-    if not target_file.exists():
-        return False
-
-    # Check if pagination and embeddings files exist and are not empty
-    if pagination_file.exists() and embeddings_file.exists():
-        # Check file sizes to ensure they're not empty
-        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-            return True
-
-    return False
-
-def organize_single_project_files(unique_id: str, skip_processed=True):
-    """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
-
-    project_dir = Path("projects") / "data" / unique_id
-
-    if not project_dir.exists():
-        logger.error(f"Project directory not found: {project_dir}")
-        return
-
-    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
-
-    files_dir = project_dir / "files"
-    dataset_dir = project_dir / "dataset"
-
-    # Check if files directory exists and has files
-    if not files_dir.exists():
-        logger.info(f" No files directory found, skipping...")
-        return
-
-    files = list(files_dir.glob("*"))
-    if not files:
-        logger.info(f" Files directory is empty, skipping...")
-        return
-
-    # Create dataset directory if it doesn't exist
-    dataset_dir.mkdir(exist_ok=True)
-
-    # Copy each file to its own directory
-    for file_path in files:
-        if file_path.is_file():
-            # Get filename without extension as directory name
-            file_name_without_ext = file_path.stem
-            target_dir = dataset_dir / file_name_without_ext
-            target_file = target_dir / "document.txt"
-            pagination_file = target_dir / "pagination.txt"
-            embeddings_file = target_dir / "embedding.pkl"
-
-            # Check if file is already processed
-            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
-                logger.info(f" Skipping already processed file: {file_path.name}")
-                continue
-
-            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
-
-            # Create target directory
-            target_dir.mkdir(exist_ok=True)
-
-            # Copy and rename file
-            shutil.copy2(str(file_path), str(target_file))
-
-    print(f" Files remain in original location (copied to dataset structure)")
-
-    # Process each document.txt file: split pages and generate embeddings
-    if not skip_processed:
-        import sys
-        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
-
-        from embedding import split_document_by_pages, embed_document
-
-        for file_path in files:
-            if file_path.is_file():
-                file_name_without_ext = file_path.stem
-                target_dir = dataset_dir / file_name_without_ext
-                document_file = target_dir / "document.txt"
-                pagination_file = target_dir / "pagination.txt"
-                embeddings_file = target_dir / "embedding.pkl"
-
-                # Skip if already processed
-                if is_file_already_processed(document_file, pagination_file, embeddings_file):
-                    print(f" Skipping document processing for already processed file: {file_path.name}")
-                    continue
-
-                # Split document by pages
-                print(f" Splitting pages for {document_file.name}")
-                try:
-                    pages = split_document_by_pages(str(document_file), str(pagination_file))
-                    print(f" Generated {len(pages)} pages")
-                except Exception as e:
-                    print(f" Failed to split pages: {e}")
-                    continue
-
-                # Generate embeddings
-                print(f" Generating embeddings for {document_file.name}")
-                try:
-                    # Use paragraph chunking strategy with default settings
-                    embedding_data = embed_document(
-                        str(document_file),
-                        str(embeddings_file),
-                        chunking_strategy='paragraph'
-                    )
-
-                    if embedding_data:
-                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
-                    else:
-                        print(f" Failed to generate embeddings")
-                except Exception as e:
-                    print(f" Failed to generate embeddings: {e}")
-
-        print(f" Document processing completed for project {unique_id}")
-    else:
-        print(f" Skipping document processing (skip_processed=True)")
-
-
-def organize_dataset_files():
-    """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
-
-    projects_dir = Path("projects") / "data"
-
-    if not projects_dir.exists():
-        print("Projects directory not found")
-        return
-
-    # Get all project directories (exclude cache and other non-project dirs)
-    project_dirs = [d for d in projects_dir.iterdir()
-                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]
-
-    for project_dir in project_dirs:
-        print(f"\nProcessing project: {project_dir.name}")
-
-        files_dir = project_dir / "files"
-        dataset_dir = project_dir / "dataset"
-
-        # Check if files directory exists and has files
-        if not files_dir.exists():
-            logger.info(f" No files directory found, skipping...")
-            continue
-
-        files = list(files_dir.glob("*"))
-        if not files:
-            logger.info(f" Files directory is empty, skipping...")
-            continue
-
-        # Create dataset directory if it doesn't exist
-        dataset_dir.mkdir(exist_ok=True)
-
-        # Move each file to its own directory
-        for file_path in files:
-            if file_path.is_file():
-                # Get filename without extension as directory name
-                file_name_without_ext = file_path.stem
-                target_dir = dataset_dir / file_name_without_ext
-                target_file = target_dir / "document.txt"
-
-                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
-
-                # Create target directory
-                target_dir.mkdir(exist_ok=True)
-
-                # Copy and rename file
-                shutil.copy2(str(file_path), str(target_file))
-
-        print(f" Files remain in original location (copied to dataset structure)")
-
-    print("\nFile organization complete!")
-
-if __name__ == "__main__":
-    organize_dataset_files()
@@ -54,6 +54,17 @@ def setup_deepagents_symlink():
             logger.info(f"Removed existing symlink pointing to {target}")

     # Create the symbolic link
+    # Check again before creating to handle race conditions
+    if deepagents_dir.is_symlink() or deepagents_dir.exists():
+        logger.warning(f"Path {deepagents_dir} exists, attempting to remove before symlink")
+        if deepagents_dir.is_symlink():
+            deepagents_dir.unlink()
+        elif deepagents_dir.is_dir():
+            import shutil
+            shutil.rmtree(str(deepagents_dir))
+        else:
+            deepagents_dir.unlink()
+
     os.symlink(robot_dir, deepagents_dir, target_is_directory=True)
     logger.info(f"Created symbolic link: {deepagents_dir} -> {robot_dir}")
     return True
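The added guard checks both is_symlink() and exists() because Path.exists() follows the link: a dangling symlink reports exists() == False while is_symlink() still reports True, so checking exists() alone would miss it. A standalone sketch of that distinction (POSIX only, throwaway temp paths, not from this repo):

```python
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    link = Path(tmp) / "deepagents"
    os.symlink(Path(tmp) / "missing-target", link)  # link to a path that does not exist

    print(link.exists())      # False - exists() follows the dangling link
    print(link.is_symlink())  # True  - the link itself is present
    link.unlink()             # removing the link, as the hunk does
```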