From d3465d418e485f5b4ec246a100ddfe974f85c900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Tue, 23 Dec 2025 17:36:45 +0800 Subject: [PATCH] add symlink --- agent/prompt_loader.py | 14 ++- prompt/system_prompt_deep_agent.md | 62 ++++++++++ task_queue/tasks.py | 1 - utils/__init__.py | 15 --- utils/fastapi_utils.py | 3 +- utils/file_utils.py | 12 -- utils/multi_project_manager.py | 108 +++-------------- utils/organize_dataset_files.py | 179 ----------------------------- utils/symlink_utils.py | 11 ++ 9 files changed, 103 insertions(+), 302 deletions(-) create mode 100644 prompt/system_prompt_deep_agent.md delete mode 100644 utils/organize_dataset_files.py diff --git a/agent/prompt_loader.py b/agent/prompt_loader.py index ddd2bfc..bbb76f2 100644 --- a/agent/prompt_loader.py +++ b/agent/prompt_loader.py @@ -98,9 +98,7 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste datetime_str = format_datetime_by_language(language) if language else format_datetime_by_language('en') # 如果存在{language} 占位符,那么就直接使用 system_prompt - if robot_type == "deep_agent": - return None - elif robot_type == "general_agent" or robot_type == "catalog_agent": + if robot_type == "general_agent" or robot_type == "catalog_agent" or robot_type == "deep_agent": """ 优先使用项目目录的README.md,没有才使用默认的system_prompt_{robot_type}.md """ @@ -122,7 +120,15 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste readme_path = os.path.join(project_dir, "README.md") readme = await config_cache.get_text_file(readme_path) or "" - prompt = system_prompt_default.format(readme=str(readme), extra_prompt=system_prompt or "",language=language_display, user_identifier=user_identifier, datetime=datetime_str) + agent_dir_path = f"~/.deepagents/{bot_id}" #agent_dir_path 其实映射的就是 project_dir目录,只是给ai看的目录路径 + prompt = system_prompt_default.format( + readme=str(readme), + extra_prompt=system_prompt or "", + language=language_display, + user_identifier=user_identifier, + datetime=datetime_str, + agent_dir_path=agent_dir_path + ) elif system_prompt: prompt = system_prompt.format(language=language_display, user_identifier=user_identifier, datetime=datetime_str) return prompt or "" diff --git a/prompt/system_prompt_deep_agent.md b/prompt/system_prompt_deep_agent.md new file mode 100644 index 0000000..3ec5258 --- /dev/null +++ b/prompt/system_prompt_deep_agent.md @@ -0,0 +1,62 @@ + +Working directory: {agent_dir_path} + + +### Current Working Directory + +The filesystem backend is currently operating in: `{agent_dir_path}` + +### File System and Paths + +**IMPORTANT - Path Handling:** +- All file paths must be absolute paths (e.g., `{agent_dir_path}/file.txt`) +- Use the working directory from to construct absolute paths +- Example: To create a file in your working directory, use `{agent_dir_path}/research_project/file.md` +- Never use relative paths - always construct full absolute paths + +### Skills Directory + +Your skills are stored at: `{agent_dir_path}/skills/` +Skills may contain scripts or supporting files. When executing skill scripts with bash, use the real filesystem path: +Example: `bash python {agent_dir_path}/skills/web-research/script.py` + +### Human-in-the-Loop Tool Approval + +Some tool calls require user approval before execution. When a tool call is rejected by the user: +1. Accept their decision immediately - do NOT retry the same command +2. Explain that you understand they rejected the action +3. Suggest an alternative approach or ask for clarification +4. 
Never attempt the exact same rejected command again + +Respect the user's decisions and work with them collaboratively. + +### Web Search Tool Usage + +When you use the web_search tool: +1. The tool will return search results with titles, URLs, and content excerpts +2. You MUST read and process these results, then respond naturally to the user +3. NEVER show raw JSON or tool results directly to the user +4. Synthesize the information from multiple sources into a coherent answer +5. Cite your sources by mentioning page titles or URLs when relevant +6. If the search doesn't find what you need, explain what you found and ask clarifying questions + +The user only sees your text responses - not tool results. Always provide a complete, natural language answer after using web_search. + +### Todo List Management + +When using the write_todos tool: +1. Keep the todo list MINIMAL - aim for 3-6 items maximum +2. Only create todos for complex, multi-step tasks that truly need tracking +3. Break down work into clear, actionable items without over-fragmenting +4. For simple tasks (1-2 steps), just do them directly without creating todos +5. When first creating a todo list for a task, ALWAYS ask the user if the plan looks good before starting work + - Create the todos, let them render, then ask: "Does this plan look good?" or similar + - Wait for the user's response before marking the first todo as in_progress + - If they want changes, adjust the plan accordingly +6. Update todo status promptly as you complete each item + +The todo list is a planning tool - use it judiciously to avoid overwhelming the user with excessive task tracking. + +## System Information +- **Current User**: {user_identifier} +- **Current Time**: {datetime} diff --git a/task_queue/tasks.py b/task_queue/tasks.py index 50f04e6..7911599 100644 --- a/task_queue/tasks.py +++ b/task_queue/tasks.py @@ -19,7 +19,6 @@ from .config import huey from utils.file_utils import ( extract_zip_file, get_file_hash, - is_file_already_processed, load_processed_files_log, save_processed_files_log, get_document_preview diff --git a/utils/__init__.py b/utils/__init__.py index a3c12e0..0699d4b 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -9,7 +9,6 @@ from .file_utils import ( remove_file_or_directory, extract_zip_file, get_document_preview, - is_file_already_processed, load_processed_files_log, save_processed_files_log ) @@ -44,11 +43,6 @@ from .agent_pool import ( release_agent_to_pool ) -from .organize_dataset_files import ( - is_file_already_processed, - organize_single_project_files, - organize_dataset_files -) from .api_models import ( Message, @@ -77,8 +71,6 @@ from .api_models import ( from .multi_project_manager import ( create_robot_project, - get_robot_project_info, - cleanup_robot_project, get_unique_folder_name, copy_dataset_folder, generate_robot_readme @@ -96,7 +88,6 @@ __all__ = [ 'remove_file_or_directory', 'extract_zip_file', 'get_document_preview', - 'is_file_already_processed', 'load_processed_files_log', 'save_processed_files_log', @@ -122,10 +113,6 @@ __all__ = [ 'get_agent_from_pool', 'release_agent_to_pool', - # organize_dataset_files - 'is_file_already_processed', - 'organize_single_project_files', - 'organize_dataset_files', # api_models 'Message', @@ -152,8 +139,6 @@ __all__ = [ # multi_project_manager 'create_robot_project', - 'get_robot_project_info', - 'cleanup_robot_project', 'get_unique_folder_name', 'copy_dataset_folder', 'generate_robot_readme', diff --git a/utils/fastapi_utils.py b/utils/fastapi_utils.py index 
3033796..e11e066 100644 --- a/utils/fastapi_utils.py +++ b/utils/fastapi_utils.py @@ -373,7 +373,8 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo try: from utils.multi_project_manager import create_robot_project - return create_robot_project(dataset_ids, bot_id) + from pathlib import Path + return create_robot_project(dataset_ids, bot_id, Path("~", ".deepagents")) except Exception as e: logger.error(f"Error creating project directory: {e}") return None diff --git a/utils/file_utils.py b/utils/file_utils.py index 8495d61..2cdb7c7 100644 --- a/utils/file_utils.py +++ b/utils/file_utils.py @@ -91,18 +91,6 @@ def get_document_preview(document_path: str, max_lines: int = 10) -> str: return f"Error reading document: {str(e)}" -def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool: - """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)""" - if not target_file.exists(): - return False - - # Check if pagination and embeddings files exist and are not empty - if pagination_file.exists() and embeddings_file.exists(): - # Check file sizes to ensure they're not empty - if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0: - return True - - return False def load_processed_files_log(unique_id: str) -> Dict[str, Dict]: diff --git a/utils/multi_project_manager.py b/utils/multi_project_manager.py index 76cc16f..b7b167a 100644 --- a/utils/multi_project_manager.py +++ b/utils/multi_project_manager.py @@ -141,7 +141,7 @@ def get_unique_folder_name(target_dir: Path, original_name: str) -> str: counter += 1 -def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict: +def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str, project_path: Path) -> Dict: """ 复制单个项目的dataset文件夹到目标目录 @@ -149,6 +149,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder source_project_id: 源项目ID target_dataset_dir: 目标dataset目录 folder_name: 要复制的文件夹名称 + project_path: 项目路径 Returns: Dict: 复制结果 @@ -163,7 +164,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder } try: - source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name + source_folder = project_path / "data" / source_project_id / "dataset" / folder_name result["source_path"] = str(source_folder) if not source_folder.exists(): @@ -190,7 +191,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder return result -def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str: +def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict], project_path: Path) -> str: """ 生成机器人项目的README.md文件 @@ -202,10 +203,10 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L Returns: str: README.md文件路径 """ - readme_path = Path("projects") / "robot" / robot_id / "README.md" + readme_path = project_path / "robot" / robot_id / "README.md" readme_path.parent.mkdir(parents=True, exist_ok=True) - robot_dir = Path("projects") / "robot" / robot_id + robot_dir = project_path / "robot" / robot_id # 统计信息 total_folders = len(copy_results) @@ -300,7 +301,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L return str(readme_path) -def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool: +def 
should_rebuild_robot_project(dataset_ids: List[str], bot_id: str, project_path: Path) -> bool: """ 检查是否需要重建机器人项目 1. 检查机器人项目是否存在 @@ -310,11 +311,12 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool: Args: dataset_ids: 源项目ID列表 bot_id: 机器人ID + project_path: 项目路径 Returns: bool: 是否需要重建 """ - robot_dir = Path("projects") / "robot" / bot_id + robot_dir = project_path / "robot" / bot_id # 如果机器人项目不存在,需要创建 if not robot_dir.exists(): @@ -356,7 +358,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool: # 检查每个源项目的processing_log.json文件 for source_project_id in dataset_ids: - log_file = Path("projects") / "data" / source_project_id / "processing_log.json" + log_file = project_path / "data" / source_project_id / "processing_log.json" if not log_file.exists(): logger.info(f"Processing log file not found for project {source_project_id}, will rebuild") @@ -373,7 +375,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool: return False -def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str: +def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False, project_path: Path = Path("projects")) -> str: """ 创建机器人项目,合并多个源项目的dataset文件夹 @@ -386,15 +388,15 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo str: 机器人项目目录路径 """ logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}") - + # 检查是否需要重建 - if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id): - robot_dir = Path("projects") / "robot" / bot_id + if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id, project_path): + robot_dir = project_path / "robot" / bot_id logger.info(f"Using existing robot project: {robot_dir}") return str(robot_dir) # 创建机器人目录结构 - robot_dir = Path("projects") / "robot" / bot_id + robot_dir = project_path / "robot" / bot_id dataset_dir = robot_dir / "dataset" # 清理已存在的目录(如果需要) @@ -411,7 +413,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo for source_project_id in dataset_ids: logger.info(f"\nProcessing source project: {source_project_id}") - source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset" + source_dataset_dir = project_path / "data" / source_project_id / "dataset" if not source_dataset_dir.exists(): logger.warning(f" Warning: Dataset directory not found for project {source_project_id}") @@ -426,7 +428,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo # 复制每个文件夹 for folder in folders: - result = copy_dataset_folder(source_project_id, dataset_dir, folder.name) + result = copy_dataset_folder(source_project_id, dataset_dir, folder.name, project_path) copy_results.append(result) # 保存配置信息 @@ -442,7 +444,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo json.dump(config_data, f, ensure_ascii=False, indent=2) # 生成README - readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results) + readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results, project_path) # 统计信息 successful_copies = sum(1 for r in copy_results if r["success"]) @@ -456,78 +458,6 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo return str(robot_dir) -def get_robot_project_info(bot_id: str) -> Dict: - """ - 获取机器人项目信息 - - Args: - bot_id: 机器人ID - - Returns: - Dict: 机器人项目信息 - """ - robot_dir = Path("projects") / "robot" / bot_id - - if 
not robot_dir.exists(): - return { - "exists": False, - "bot_id": bot_id, - "error": "Robot project does not exist" - } - - dataset_dir = robot_dir / "dataset" - readme_path = robot_dir / "README.md" - - # 统计文件夹数量 - folder_count = 0 - total_size = 0 - if dataset_dir.exists(): - for item in dataset_dir.iterdir(): - if item.is_dir(): - folder_count += 1 - # 计算文件夹大小 - for file_path in item.rglob('*'): - if file_path.is_file(): - total_size += file_path.stat().st_size - - return { - "exists": True, - "bot_id": bot_id, - "robot_dir": str(robot_dir), - "dataset_dir": str(dataset_dir), - "readme_exists": readme_path.exists(), - "folder_count": folder_count, - "total_size_bytes": total_size, - "total_size_mb": round(total_size / (1024 * 1024), 2) - } - - -def cleanup_robot_project(bot_id: str) -> bool: - """ - 清理机器人项目 - - Args: - bot_id: 机器人ID - - Returns: - bool: 清理是否成功 - """ - try: - robot_dir = Path("projects") / "robot" / bot_id - - if robot_dir.exists(): - shutil.rmtree(robot_dir) - logger.info(f"Cleaned up robot project: {bot_id}") - return True - else: - logger.info(f"Robot project does not exist: {bot_id}") - return True - - except Exception as e: - logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}") - return False - - if __name__ == "__main__": # 测试代码 test_dataset_ids = ["test-project-1", "test-project-2"] @@ -536,5 +466,3 @@ if __name__ == "__main__": robot_dir = create_robot_project(test_dataset_ids, test_bot_id) logger.info(f"Created robot project at: {robot_dir}") - info = get_robot_project_info(test_bot_id) - logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}") diff --git a/utils/organize_dataset_files.py b/utils/organize_dataset_files.py deleted file mode 100644 index 4cff748..0000000 --- a/utils/organize_dataset_files.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python3 -import os -import shutil -import logging -from pathlib import Path - -# 配置日志 -logger = logging.getLogger('app') - -def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool: - """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)""" - if not target_file.exists(): - return False - - # Check if pagination and embeddings files exist and are not empty - if pagination_file.exists() and embeddings_file.exists(): - # Check file sizes to ensure they're not empty - if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0: - return True - - return False - -def organize_single_project_files(unique_id: str, skip_processed=True): - """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt""" - - project_dir = Path("projects") / "data" / unique_id - - if not project_dir.exists(): - logger.error(f"Project directory not found: {project_dir}") - return - - logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})") - - files_dir = project_dir / "files" - dataset_dir = project_dir / "dataset" - - # Check if files directory exists and has files - if not files_dir.exists(): - logger.info(f" No files directory found, skipping...") - return - - files = list(files_dir.glob("*")) - if not files: - logger.info(f" Files directory is empty, skipping...") - return - - # Create dataset directory if it doesn't exist - dataset_dir.mkdir(exist_ok=True) - - # Copy each file to its own directory - for file_path in files: - if file_path.is_file(): - # Get filename without 
extension as directory name - file_name_without_ext = file_path.stem - target_dir = dataset_dir / file_name_without_ext - target_file = target_dir / "document.txt" - pagination_file = target_dir / "pagination.txt" - embeddings_file = target_dir / "embedding.pkl" - - # Check if file is already processed - if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file): - logger.info(f" Skipping already processed file: {file_path.name}") - continue - - logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}") - - # Create target directory - target_dir.mkdir(exist_ok=True) - - # Copy and rename file - shutil.copy2(str(file_path), str(target_file)) - - print(f" Files remain in original location (copied to dataset structure)") - - # Process each document.txt file: split pages and generate embeddings - if not skip_processed: - import sys - sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding')) - - from embedding import split_document_by_pages, embed_document - - for file_path in files: - if file_path.is_file(): - file_name_without_ext = file_path.stem - target_dir = dataset_dir / file_name_without_ext - document_file = target_dir / "document.txt" - pagination_file = target_dir / "pagination.txt" - embeddings_file = target_dir / "embedding.pkl" - - # Skip if already processed - if is_file_already_processed(document_file, pagination_file, embeddings_file): - print(f" Skipping document processing for already processed file: {file_path.name}") - continue - - # Split document by pages - print(f" Splitting pages for {document_file.name}") - try: - pages = split_document_by_pages(str(document_file), str(pagination_file)) - print(f" Generated {len(pages)} pages") - except Exception as e: - print(f" Failed to split pages: {e}") - continue - - # Generate embeddings - print(f" Generating embeddings for {document_file.name}") - try: - # Use paragraph chunking strategy with default settings - embedding_data = embed_document( - str(document_file), - str(embeddings_file), - chunking_strategy='paragraph' - ) - - if embedding_data: - print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks") - else: - print(f" Failed to generate embeddings") - except Exception as e: - print(f" Failed to generate embeddings: {e}") - - print(f" Document processing completed for project {unique_id}") - else: - print(f" Skipping document processing (skip_processed=True)") - - -def organize_dataset_files(): - """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt""" - - projects_dir = Path("projects") / "data" - - if not projects_dir.exists(): - print("Projects directory not found") - return - - # Get all project directories (exclude cache and other non-project dirs) - project_dirs = [d for d in projects_dir.iterdir() - if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")] - - for project_dir in project_dirs: - print(f"\nProcessing project: {project_dir.name}") - - files_dir = project_dir / "files" - dataset_dir = project_dir / "dataset" - - # Check if files directory exists and has files - if not files_dir.exists(): - logger.info(f" No files directory found, skipping...") - continue - - files = list(files_dir.glob("*")) - if not files: - logger.info(f" Files directory is empty, skipping...") - continue - - # Create dataset directory if it doesn't exist - dataset_dir.mkdir(exist_ok=True) - - # Move each file to its own directory - for file_path in files: - if 
file_path.is_file(): - # Get filename without extension as directory name - file_name_without_ext = file_path.stem - target_dir = dataset_dir / file_name_without_ext - target_file = target_dir / "document.txt" - - logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}") - - # Create target directory - target_dir.mkdir(exist_ok=True) - - # Copy and rename file - shutil.copy2(str(file_path), str(target_file)) - - print(f" Files remain in original location (copied to dataset structure)") - - print("\nFile organization complete!") - -if __name__ == "__main__": - organize_dataset_files() diff --git a/utils/symlink_utils.py b/utils/symlink_utils.py index 0e83de4..bd7a3b0 100644 --- a/utils/symlink_utils.py +++ b/utils/symlink_utils.py @@ -54,6 +54,17 @@ def setup_deepagents_symlink(): logger.info(f"Removed existing symlink pointing to {target}") # Create the symbolic link + # Check again before creating to handle race conditions + if deepagents_dir.is_symlink() or deepagents_dir.exists(): + logger.warning(f"Path {deepagents_dir} exists, attempting to remove before symlink") + if deepagents_dir.is_symlink(): + deepagents_dir.unlink() + elif deepagents_dir.is_dir(): + import shutil + shutil.rmtree(str(deepagents_dir)) + else: + deepagents_dir.unlink() + os.symlink(robot_dir, deepagents_dir, target_is_directory=True) logger.info(f"Created symbolic link: {deepagents_dir} -> {robot_dir}") return True
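
---

The changes above wire the agent-visible path `~/.deepagents/{bot_id}` to the on-disk robot project through a symbolic link, and `setup_deepagents_symlink` now clears any pre-existing path immediately before calling `os.symlink` to avoid racing with the earlier existence check. Below is a minimal, self-contained sketch of that mapping; `ensure_deepagents_symlink` and the `demo-bot` id are hypothetical names for illustration, and the `projects/robot` layout is assumed from the diff rather than taken from the repository's configuration.

```python
#!/usr/bin/env python3
"""Illustrative sketch of the ~/.deepagents symlink mapping (not the repo's code)."""
import os
import shutil
from pathlib import Path


def ensure_deepagents_symlink(robot_root: Path, link_path: Path) -> None:
    """Point link_path (e.g. ~/.deepagents) at robot_root, idempotently.

    Mirrors the pattern added in the patch: remove whatever currently
    occupies link_path (stale symlink, directory, or regular file) right
    before creating the link, so nothing conflicting is left behind.
    """
    robot_root = robot_root.resolve()
    link_path = link_path.expanduser()

    # Check is_symlink() first: exists() follows the link and returns
    # False for a dangling symlink, which would otherwise be skipped.
    if link_path.is_symlink():
        link_path.unlink()
    elif link_path.is_dir():
        shutil.rmtree(link_path)
    elif link_path.exists():
        link_path.unlink()

    os.symlink(robot_root, link_path, target_is_directory=True)


if __name__ == "__main__":
    robot_root = Path("projects") / "robot"   # real on-disk location (assumed)
    link_path = Path("~") / ".deepagents"     # path shown to the agent
    ensure_deepagents_symlink(robot_root, link_path)

    bot_id = "demo-bot"
    agent_dir_path = f"~/.deepagents/{bot_id}"        # what {agent_dir_path} renders to
    real_dir = link_path.expanduser() / bot_id        # where the files actually live
    print(agent_dir_path, "->", real_dir.resolve())
```

If the same tree is rebuilt through `create_robot_project`, note that the signature introduced in this patch places `project_path` after `force_rebuild`, so passing it by keyword (`create_robot_project(dataset_ids, bot_id, project_path=...)`) keeps it from being consumed as the rebuild flag.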