add symlink

parent 61c6b69aa5
commit d3465d418e
@@ -98,9 +98,7 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
    datetime_str = format_datetime_by_language(language) if language else format_datetime_by_language('en')

    # If a {language} placeholder exists, use system_prompt directly
-   if robot_type == "deep_agent":
-       return None
-   elif robot_type == "general_agent" or robot_type == "catalog_agent":
+   if robot_type == "general_agent" or robot_type == "catalog_agent" or robot_type == "deep_agent":
        """
        Prefer the README.md in the project directory; fall back to the default system_prompt_{robot_type}.md only when it is missing
        """
@@ -122,7 +120,15 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
        readme_path = os.path.join(project_dir, "README.md")
        readme = await config_cache.get_text_file(readme_path) or ""

-       prompt = system_prompt_default.format(readme=str(readme), extra_prompt=system_prompt or "", language=language_display, user_identifier=user_identifier, datetime=datetime_str)
+       agent_dir_path = f"~/.deepagents/{bot_id}"  # agent_dir_path maps to the project_dir directory; it is only the directory path shown to the AI
+       prompt = system_prompt_default.format(
+           readme=str(readme),
+           extra_prompt=system_prompt or "",
+           language=language_display,
+           user_identifier=user_identifier,
+           datetime=datetime_str,
+           agent_dir_path=agent_dir_path
+       )
    elif system_prompt:
        prompt = system_prompt.format(language=language_display, user_identifier=user_identifier, datetime=datetime_str)
    return prompt or ""
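For context, a minimal sketch (variable values are assumptions, not taken from the repo) of how the new `agent_dir_path` keyword flows into the template via `str.format`, alongside the existing placeholders:

```python
# Hypothetical illustration of the substitution above; the real template is
# prompt/system_prompt_deep_agent.md and the real values come from
# load_system_prompt_async.
template = "Working directory: {agent_dir_path}\nUser: {user_identifier}\nTime: {datetime}"

bot_id = "demo-bot"                          # assumed example value
agent_dir_path = f"~/.deepagents/{bot_id}"   # virtual path shown to the agent

prompt = template.format(
    agent_dir_path=agent_dir_path,
    user_identifier="alice",                 # assumed example value
    datetime="2024-01-01 12:00",             # assumed example value
)
print(prompt)
```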
prompt/system_prompt_deep_agent.md (new file, 62 lines)
@@ -0,0 +1,62 @@
<env>
Working directory: {agent_dir_path}
</env>

### Current Working Directory

The filesystem backend is currently operating in: `{agent_dir_path}`

### File System and Paths

**IMPORTANT - Path Handling:**
- All file paths must be absolute paths (e.g., `{agent_dir_path}/file.txt`)
- Use the working directory from <env> to construct absolute paths
- Example: To create a file in your working directory, use `{agent_dir_path}/research_project/file.md`
- Never use relative paths - always construct full absolute paths

### Skills Directory

Your skills are stored at: `{agent_dir_path}/skills/`
Skills may contain scripts or supporting files. When executing skill scripts with bash, use the real filesystem path:
Example: `bash python {agent_dir_path}/skills/web-research/script.py`

### Human-in-the-Loop Tool Approval

Some tool calls require user approval before execution. When a tool call is rejected by the user:
1. Accept their decision immediately - do NOT retry the same command
2. Explain that you understand they rejected the action
3. Suggest an alternative approach or ask for clarification
4. Never attempt the exact same rejected command again

Respect the user's decisions and work with them collaboratively.

### Web Search Tool Usage

When you use the web_search tool:
1. The tool will return search results with titles, URLs, and content excerpts
2. You MUST read and process these results, then respond naturally to the user
3. NEVER show raw JSON or tool results directly to the user
4. Synthesize the information from multiple sources into a coherent answer
5. Cite your sources by mentioning page titles or URLs when relevant
6. If the search doesn't find what you need, explain what you found and ask clarifying questions

The user only sees your text responses - not tool results. Always provide a complete, natural language answer after using web_search.

### Todo List Management

When using the write_todos tool:
1. Keep the todo list MINIMAL - aim for 3-6 items maximum
2. Only create todos for complex, multi-step tasks that truly need tracking
3. Break down work into clear, actionable items without over-fragmenting
4. For simple tasks (1-2 steps), just do them directly without creating todos
5. When first creating a todo list for a task, ALWAYS ask the user if the plan looks good before starting work
   - Create the todos, let them render, then ask: "Does this plan look good?" or similar
   - Wait for the user's response before marking the first todo as in_progress
   - If they want changes, adjust the plan accordingly
6. Update todo status promptly as you complete each item

The todo list is a planning tool - use it judiciously to avoid overwhelming the user with excessive task tracking.

## System Information
- **Current User**: {user_identifier}
- **Current Time**: {datetime}
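The `{agent_dir_path}` shown to the agent is the `~/.deepagents/...` virtual path; if the symlink set up elsewhere in this commit maps it onto the real robot project directory, it can be resolved back as in this minimal sketch (paths and values are assumptions, not the project's actual code):

```python
from pathlib import Path

bot_id = "demo-bot"                          # assumed example value
agent_dir_path = f"~/.deepagents/{bot_id}"   # virtual path the agent sees

# expanduser() turns "~" into the real home directory; resolve() follows any
# symlink so the result is whatever real directory the link points to.
real_dir = Path(agent_dir_path).expanduser().resolve()
print(real_dir)
```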
@@ -19,7 +19,6 @@ from .config import huey
from utils.file_utils import (
    extract_zip_file,
    get_file_hash,
    is_file_already_processed,
    load_processed_files_log,
    save_processed_files_log,
    get_document_preview
@@ -9,7 +9,6 @@ from .file_utils import (
    remove_file_or_directory,
    extract_zip_file,
    get_document_preview,
    is_file_already_processed,
    load_processed_files_log,
    save_processed_files_log
)
@@ -44,11 +43,6 @@ from .agent_pool import (
    release_agent_to_pool
)

- from .organize_dataset_files import (
-     is_file_already_processed,
-     organize_single_project_files,
-     organize_dataset_files
- )

from .api_models import (
    Message,
@@ -77,8 +71,6 @@ from .api_models import (

from .multi_project_manager import (
    create_robot_project,
    get_robot_project_info,
    cleanup_robot_project,
    get_unique_folder_name,
    copy_dataset_folder,
    generate_robot_readme
@@ -96,7 +88,6 @@ __all__ = [
    'remove_file_or_directory',
    'extract_zip_file',
    'get_document_preview',
    'is_file_already_processed',
    'load_processed_files_log',
    'save_processed_files_log',
@@ -122,10 +113,6 @@ __all__ = [
    'get_agent_from_pool',
    'release_agent_to_pool',

-     # organize_dataset_files
-     'is_file_already_processed',
-     'organize_single_project_files',
-     'organize_dataset_files',

    # api_models
    'Message',
@@ -152,8 +139,6 @@ __all__ = [

    # multi_project_manager
    'create_robot_project',
    'get_robot_project_info',
    'cleanup_robot_project',
    'get_unique_folder_name',
    'copy_dataset_folder',
    'generate_robot_readme',
@@ -373,7 +373,8 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo

    try:
        from utils.multi_project_manager import create_robot_project
-       return create_robot_project(dataset_ids, bot_id)
+       from pathlib import Path
+       return create_robot_project(dataset_ids, bot_id, Path("~", ".deepagents"))
    except Exception as e:
        logger.error(f"Error creating project directory: {e}")
        return None
@@ -91,18 +91,6 @@ def get_document_preview(document_path: str, max_lines: int = 10) -> str:
        return f"Error reading document: {str(e)}"


- def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-     """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-     if not target_file.exists():
-         return False
-
-     # Check if pagination and embeddings files exist and are not empty
-     if pagination_file.exists() and embeddings_file.exists():
-         # Check file sizes to ensure they're not empty
-         if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-             return True
-
-     return False


def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
@@ -141,7 +141,7 @@ def get_unique_folder_name(target_dir: Path, original_name: str) -> str:
        counter += 1


- def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict:
+ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str, project_path: Path) -> Dict:
    """
    Copy a single project's dataset folder to the target directory

@@ -149,6 +149,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
        source_project_id: source project ID
        target_dataset_dir: target dataset directory
        folder_name: name of the folder to copy
+       project_path: project path

    Returns:
        Dict: copy result
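A hypothetical call with the widened signature, mirroring the call site changed later in this diff (the IDs, folder name, and project root are placeholder values):

```python
from pathlib import Path

# Placeholder invocation; "projects" is the default project root used by
# create_robot_project elsewhere in this module.
result = copy_dataset_folder(
    source_project_id="demo-project-1",
    target_dataset_dir=Path("projects") / "robot" / "demo-bot" / "dataset",
    folder_name="report",
    project_path=Path("projects"),
)
print(result["success"], result["source_path"])
```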
@@ -163,7 +164,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
        }

    try:
-       source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name
+       source_folder = project_path / "data" / source_project_id / "dataset" / folder_name
        result["source_path"] = str(source_folder)

        if not source_folder.exists():
@@ -190,7 +191,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
    return result


- def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str:
+ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict], project_path: Path) -> str:
    """
    Generate the README.md file for the robot project

@@ -202,10 +203,10 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
    Returns:
        str: path of the README.md file
    """
-   readme_path = Path("projects") / "robot" / robot_id / "README.md"
+   readme_path = project_path / "robot" / robot_id / "README.md"
    readme_path.parent.mkdir(parents=True, exist_ok=True)

-   robot_dir = Path("projects") / "robot" / robot_id
+   robot_dir = project_path / "robot" / robot_id

    # Statistics
    total_folders = len(copy_results)
@@ -300,7 +301,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
    return str(readme_path)


- def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
+ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str, project_path: Path) -> bool:
    """
    Check whether the robot project needs to be rebuilt
    1. Check whether the robot project exists
@@ -310,11 +311,12 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
    Args:
        dataset_ids: list of source project IDs
        bot_id: robot ID
+       project_path: project path

    Returns:
        bool: whether a rebuild is needed
    """
-   robot_dir = Path("projects") / "robot" / bot_id
+   robot_dir = project_path / "robot" / bot_id

    # Create it if the robot project does not exist
    if not robot_dir.exists():
@@ -356,7 +358,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:

    # Check each source project's processing_log.json file
    for source_project_id in dataset_ids:
-       log_file = Path("projects") / "data" / source_project_id / "processing_log.json"
+       log_file = project_path / "data" / source_project_id / "processing_log.json"

        if not log_file.exists():
            logger.info(f"Processing log file not found for project {source_project_id}, will rebuild")
@@ -373,7 +375,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
    return False


- def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str:
+ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False, project_path: Path = Path("projects")) -> str:
    """
    Create a robot project by merging the dataset folders of multiple source projects

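A minimal usage sketch of the widened signature (IDs are placeholder values; the project root shown is the one create_project_directory passes earlier in this diff):

```python
from pathlib import Path
from utils.multi_project_manager import create_robot_project

# Hypothetical call site; real callers supply their own dataset IDs and bot ID.
robot_dir = create_robot_project(
    dataset_ids=["demo-project-1", "demo-project-2"],  # assumed example IDs
    bot_id="demo-bot",                                 # assumed example ID
    project_path=Path("~", ".deepagents"),
)
print(robot_dir)  # project_path / "robot" / bot_id, e.g. "~/.deepagents/robot/demo-bot"
```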
@@ -386,15 +388,15 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
        str: robot project directory path
    """
    logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}")

    # Check whether a rebuild is needed
-   if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id):
-       robot_dir = Path("projects") / "robot" / bot_id
+   if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id, project_path):
+       robot_dir = project_path / "robot" / bot_id
        logger.info(f"Using existing robot project: {robot_dir}")
        return str(robot_dir)

    # Create the robot directory structure
-   robot_dir = Path("projects") / "robot" / bot_id
+   robot_dir = project_path / "robot" / bot_id
    dataset_dir = robot_dir / "dataset"

    # Clean up the existing directory (if needed)
@@ -411,7 +413,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
    for source_project_id in dataset_ids:
        logger.info(f"\nProcessing source project: {source_project_id}")

-       source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset"
+       source_dataset_dir = project_path / "data" / source_project_id / "dataset"

        if not source_dataset_dir.exists():
            logger.warning(f" Warning: Dataset directory not found for project {source_project_id}")
@@ -426,7 +428,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo

        # Copy each folder
        for folder in folders:
-           result = copy_dataset_folder(source_project_id, dataset_dir, folder.name)
+           result = copy_dataset_folder(source_project_id, dataset_dir, folder.name, project_path)
            copy_results.append(result)

    # Save the configuration info
@@ -442,7 +444,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
        json.dump(config_data, f, ensure_ascii=False, indent=2)

    # Generate the README
-   readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results)
+   readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results, project_path)

    # Statistics
    successful_copies = sum(1 for r in copy_results if r["success"])
@@ -456,78 +458,6 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
    return str(robot_dir)


- def get_robot_project_info(bot_id: str) -> Dict:
-     """
-     Get robot project information
-
-     Args:
-         bot_id: robot ID
-
-     Returns:
-         Dict: robot project information
-     """
-     robot_dir = Path("projects") / "robot" / bot_id
-
-     if not robot_dir.exists():
-         return {
-             "exists": False,
-             "bot_id": bot_id,
-             "error": "Robot project does not exist"
-         }
-
-     dataset_dir = robot_dir / "dataset"
-     readme_path = robot_dir / "README.md"
-
-     # Count the folders
-     folder_count = 0
-     total_size = 0
-     if dataset_dir.exists():
-         for item in dataset_dir.iterdir():
-             if item.is_dir():
-                 folder_count += 1
-                 # Compute folder size
-                 for file_path in item.rglob('*'):
-                     if file_path.is_file():
-                         total_size += file_path.stat().st_size
-
-     return {
-         "exists": True,
-         "bot_id": bot_id,
-         "robot_dir": str(robot_dir),
-         "dataset_dir": str(dataset_dir),
-         "readme_exists": readme_path.exists(),
-         "folder_count": folder_count,
-         "total_size_bytes": total_size,
-         "total_size_mb": round(total_size / (1024 * 1024), 2)
-     }
-
-
- def cleanup_robot_project(bot_id: str) -> bool:
-     """
-     Clean up the robot project
-
-     Args:
-         bot_id: robot ID
-
-     Returns:
-         bool: whether cleanup succeeded
-     """
-     try:
-         robot_dir = Path("projects") / "robot" / bot_id
-
-         if robot_dir.exists():
-             shutil.rmtree(robot_dir)
-             logger.info(f"Cleaned up robot project: {bot_id}")
-             return True
-         else:
-             logger.info(f"Robot project does not exist: {bot_id}")
-             return True
-
-     except Exception as e:
-         logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}")
-         return False


if __name__ == "__main__":
    # Test code
    test_dataset_ids = ["test-project-1", "test-project-2"]
@@ -536,5 +466,3 @@ if __name__ == "__main__":
    robot_dir = create_robot_project(test_dataset_ids, test_bot_id)
    logger.info(f"Created robot project at: {robot_dir}")

-     info = get_robot_project_info(test_bot_id)
-     logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
@@ -1,179 +0,0 @@
#!/usr/bin/env python3
import os
import shutil
import logging
from pathlib import Path

# Configure logging
logger = logging.getLogger('app')

def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
    if not target_file.exists():
        return False

    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False

def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""

    project_dir = Path("projects") / "data" / unique_id

    if not project_dir.exists():
        logger.error(f"Project directory not found: {project_dir}")
        return

    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")

    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"

    # Check if files directory exists and has files
    if not files_dir.exists():
        logger.info(f" No files directory found, skipping...")
        return

    files = list(files_dir.glob("*"))
    if not files:
        logger.info(f" Files directory is empty, skipping...")
        return

    # Create dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)

    # Copy each file to its own directory
    for file_path in files:
        if file_path.is_file():
            # Get filename without extension as directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"

            # Check if file is already processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                logger.info(f" Skipping already processed file: {file_path.name}")
                continue

            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

            # Create target directory
            target_dir.mkdir(exist_ok=True)

            # Copy and rename file
            shutil.copy2(str(file_path), str(target_file))

    print(f" Files remain in original location (copied to dataset structure)")

    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))

        from embedding import split_document_by_pages, embed_document

        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"

                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue

                # Split document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue

                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )

                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(f" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")

        print(f" Document processing completed for project {unique_id}")
    else:
        print(f" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""

    projects_dir = Path("projects") / "data"

    if not projects_dir.exists():
        print("Projects directory not found")
        return

    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]

    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")

        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"

        # Check if files directory exists and has files
        if not files_dir.exists():
            logger.info(f" No files directory found, skipping...")
            continue

        files = list(files_dir.glob("*"))
        if not files:
            logger.info(f" Files directory is empty, skipping...")
            continue

        # Create dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)

        # Move each file to its own directory
        for file_path in files:
            if file_path.is_file():
                # Get filename without extension as directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"

                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

                # Create target directory
                target_dir.mkdir(exist_ok=True)

                # Copy and rename file
                shutil.copy2(str(file_path), str(target_file))

        print(f" Files remain in original location (copied to dataset structure)")

    print("\nFile organization complete!")

if __name__ == "__main__":
    organize_dataset_files()
@@ -54,6 +54,17 @@ def setup_deepagents_symlink():
        logger.info(f"Removed existing symlink pointing to {target}")

    # Create the symbolic link
+   # Check again before creating to handle race conditions
+   if deepagents_dir.is_symlink() or deepagents_dir.exists():
+       logger.warning(f"Path {deepagents_dir} exists, attempting to remove before symlink")
+       if deepagents_dir.is_symlink():
+           deepagents_dir.unlink()
+       elif deepagents_dir.is_dir():
+           import shutil
+           shutil.rmtree(str(deepagents_dir))
+       else:
+           deepagents_dir.unlink()
+
    os.symlink(robot_dir, deepagents_dir, target_is_directory=True)
    logger.info(f"Created symbolic link: {deepagents_dir} -> {robot_dir}")
    return True
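For orientation, a minimal self-contained sketch of the symlink pattern this commit adds (function name, paths, and the example call are assumptions, not the project's actual setup_deepagents_symlink):

```python
import logging
import os
import shutil
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("app")

def setup_symlink_sketch(robot_dir: Path, link_path: Path) -> bool:
    """Point link_path (e.g. ~/.deepagents) at robot_dir, replacing anything stale."""
    # Remove whatever currently occupies link_path: a symlink, a directory, or a file.
    if link_path.is_symlink() or link_path.exists():
        if link_path.is_symlink():
            link_path.unlink()
        elif link_path.is_dir():
            shutil.rmtree(link_path)
        else:
            link_path.unlink()

    os.symlink(robot_dir, link_path, target_is_directory=True)
    logger.info(f"Created symbolic link: {link_path} -> {robot_dir}")
    return True

# Example with assumed paths (commented out to avoid touching the filesystem):
# setup_symlink_sketch(Path("projects/robot/demo-bot").resolve(), Path.home() / ".deepagents")
```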