diff --git a/agent/prompt_loader.py b/agent/prompt_loader.py
index 6fe7142..1642d21 100644
--- a/agent/prompt_loader.py
+++ b/agent/prompt_loader.py
@@ -291,7 +291,7 @@ async def load_mcp_settings_async(config) -> List[Dict]:
     # 计算 dataset_dir 用于替换 MCP 配置中的占位符
     # 只有当 project_dir 不为 None 时才计算 dataset_dir
-    dataset_dir = os.path.join(project_dir, "dataset") if project_dir is not None else None
+    dataset_dir = os.path.join(project_dir, "datasets") if project_dir is not None else None
 
     # 替换 MCP 配置中的 {dataset_dir} 占位符
     if dataset_dir is None:
         dataset_dir = ""
diff --git a/prompt/system_prompt.md b/prompt/system_prompt.md
index a796538..648a52a 100644
--- a/prompt/system_prompt.md
+++ b/prompt/system_prompt.md
@@ -54,7 +54,7 @@ When executing scripts from SKILL.md files, you MUST convert relative paths to a
 
 **4. Workspace Directory Structure**
 - **`{agent_dir_path}/skills/`** - Skill packages with embedded scripts
-- **`{agent_dir_path}/dataset/`** - Store file datasets and document data
+- **`{agent_dir_path}/datasets/`** - Store file datasets and document data
 - **`{agent_dir_path}/executable_code/`** - Place generated executable scripts here (not skill scripts)
 - **`{agent_dir_path}/download/`** - Store downloaded files and content
@@ -75,7 +75,7 @@ When creating scripts in `executable_code/`, follow these organization rules:
 
 **Path Examples:**
 - Skill script: `{agent_dir_path}/skills/rag-retrieve/scripts/rag_retrieve.py`
-- Dataset file: `{agent_dir_path}/dataset/document.txt`
+- Dataset file: `{agent_dir_path}/datasets/document.txt`
 - Task-specific script: `{agent_dir_path}/executable_code/invoice_parser/parse.py`
 - Temporary script (when needed): `{agent_dir_path}/executable_code/tmp/test.py`
 - Downloaded file: `{agent_dir_path}/download/report.pdf`
diff --git a/routes/files.py b/routes/files.py
index 0887eaf..1e505ec 100644
--- a/routes/files.py
+++ b/routes/files.py
@@ -213,7 +213,7 @@ async def reset_files_processing(dataset_id: str):
             elif 'filename' in file_info:
                 # Fallback to old filename-based structure
                 filename_without_ext = os.path.splitext(file_info['filename'])[0]
-                dataset_dir = os.path.join("projects", "data", dataset_id, "dataset", filename_without_ext)
+                dataset_dir = os.path.join("projects", "data", dataset_id, "datasets", filename_without_ext)
                 if remove_file_or_directory(dataset_dir):
                     removed_files.append(dataset_dir)
 
@@ -232,7 +232,7 @@ async def reset_files_processing(dataset_id: str):
            removed_files.append(files_dir)
 
         # Also remove the entire dataset directory (clean up any remaining files)
-        dataset_dir = os.path.join(project_dir, "dataset")
+        dataset_dir = os.path.join(project_dir, "datasets")
         if remove_file_or_directory(dataset_dir):
             removed_files.append(dataset_dir)
 
@@ -465,4 +465,4 @@ async def cleanup_tasks(older_than_days: int = 7):
 
     except Exception as e:
         logger.error(f"Error cleaning up tasks: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"清理任务记录失败: {str(e)}")
\ No newline at end of file
+        raise HTTPException(status_code=500, detail=f"清理任务记录失败: {str(e)}")
diff --git a/routes/projects.py b/routes/projects.py
index 87edc50..0a84377 100644
--- a/routes/projects.py
+++ b/routes/projects.py
@@ -33,8 +33,8 @@ async def list_all_projects():
 
                 # 统计文件数量
                 file_count = 0
-                if os.path.exists(os.path.join(item_path, "dataset")):
-                    for root, dirs, files in os.walk(os.path.join(item_path, "dataset")):
+                if os.path.exists(os.path.join(item_path, "datasets")):
+                    for root, dirs, files in os.walk(os.path.join(item_path, "datasets")):
                         file_count += len(files)
 
                 robot_projects.append({
@@ -173,4 +173,4 @@ async def get_project_tasks(dataset_id: str):
 
     except Exception as e:
         logger.error(f"Error getting project tasks: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"获取项目任务失败: {str(e)}")
\ No newline at end of file
+        raise HTTPException(status_code=500, detail=f"获取项目任务失败: {str(e)}")
diff --git a/task_queue/integration_tasks.py b/task_queue/integration_tasks.py
index 1575cfc..b042b3d 100644
--- a/task_queue/integration_tasks.py
+++ b/task_queue/integration_tasks.py
@@ -181,7 +181,7 @@ def process_files_async(
 
         result_files = []
         for key in processed_files_by_key.keys():
            # 添加对应的dataset document.txt路径
-            document_path = os.path.join("projects", "data", dataset_id, "dataset", key, "document.txt")
+            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
             if os.path.exists(document_path):
                 result_files.append(document_path)
@@ -382,7 +382,7 @@ def process_files_incremental_async(
 
         result_files = []
         for key in processed_files_by_key.keys():
            # 添加对应的dataset document.txt路径
-            document_path = os.path.join("projects", "data", dataset_id, "dataset", key, "document.txt")
+            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
             if os.path.exists(document_path):
                 result_files.append(document_path)
@@ -496,4 +496,4 @@ def cleanup_project_async(
             "message": error_msg,
             "dataset_id": dataset_id,
             "error": str(e)
-        }
\ No newline at end of file
+        }
diff --git a/utils/data_merger.py b/utils/data_merger.py
index 59bbd98..0bf2bba 100644
--- a/utils/data_merger.py
+++ b/utils/data_merger.py
@@ -25,7 +25,7 @@ def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all document.txt files in a group into a single document."""
 
     processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "datasets", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_document_path = os.path.join(dataset_group_dir, "document.txt")
@@ -96,7 +96,7 @@ def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all pagination.txt files in a group."""
 
     processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "datasets", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_pagination_path = os.path.join(dataset_group_dir, "pagination.txt")
@@ -166,7 +166,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all embedding.pkl files in a group."""
 
     processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "datasets", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_embedding_path = os.path.join(dataset_group_dir, "embedding.pkl")
@@ -379,7 +379,7 @@ def merge_all_data_by_group(unique_id: str, group_name: str) -> Dict:
 
 def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
     """Get the status of merged data for a group."""
-    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "datasets", group_name)
 
     status = {
         "group_name": group_name,
@@ -423,7 +423,7 @@ def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
 
 def cleanup_dataset_group(unique_id: str, group_name: str) -> bool:
     """Clean up merged dataset files for a group."""
-    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "datasets", group_name)
 
     try:
         if os.path.exists(dataset_group_dir):
diff --git a/utils/dataset_manager.py b/utils/dataset_manager.py
index ef73697..5526241 100644
--- a/utils/dataset_manager.py
+++ b/utils/dataset_manager.py
@@ -200,7 +200,7 @@ def generate_dataset_structure(unique_id: str) -> str:
         add_directory_contents(processed_dir, "")
 
     # Add dataset directory structure
-    dataset_dir = os.path.join(project_dir, "dataset")
+    dataset_dir = os.path.join(project_dir, "datasets")
     structure.append("\ndataset/")
     add_directory_contents(dataset_dir, "")
 
@@ -224,7 +224,7 @@ def get_processing_status(unique_id: str) -> Dict:
         "directories": {
             "files": os.path.exists(os.path.join(project_dir, "files")),
             "processed": os.path.exists(os.path.join(project_dir, "processed")),
-            "dataset": os.path.exists(os.path.join(project_dir, "dataset"))
+            "dataset": os.path.exists(os.path.join(project_dir, "datasets"))
         },
         "groups": {},
         "processing_log_exists": os.path.exists(os.path.join(project_dir, "processing_log.json"))
@@ -245,7 +245,7 @@ def get_processing_status(unique_id: str) -> Dict:
             }
 
     # Check merge status for each group
-    dataset_dir = os.path.join(project_dir, "dataset")
+    dataset_dir = os.path.join(project_dir, "datasets")
     if os.path.exists(dataset_dir):
         for group_name in os.listdir(dataset_dir):
             group_path = os.path.join(dataset_dir, group_name)
@@ -294,4 +294,4 @@ def remove_dataset_directory_by_key(unique_id: str, key: str):
         shutil.rmtree(processed_group_path)
 
     # Remove dataset directory
-    cleanup_dataset_group(unique_id, key)
\ No newline at end of file
+    cleanup_dataset_group(unique_id, key)
diff --git a/utils/file_manager.py b/utils/file_manager.py
index 191e45f..a3de760 100644
--- a/utils/file_manager.py
+++ b/utils/file_manager.py
@@ -228,7 +228,7 @@ def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]
                 removed_files[group_name].append("processed group directory")
 
             # Remove entire dataset/group directory
-            dataset_group_dir = os.path.join(project_dir, "dataset", group_name)
+            dataset_group_dir = os.path.join(project_dir, "datasets", group_name)
             if os.path.exists(dataset_group_dir):
                 shutil.rmtree(dataset_group_dir)
                 removed_files[group_name].append("dataset group directory")
@@ -263,9 +263,9 @@ def ensure_directories(unique_id: str):
     directories = [
         "files",
         "processed",
-        "dataset"
+        "datasets"
     ]
 
     for dir_name in directories:
         dir_path = os.path.join(base_dir, dir_name)
-        os.makedirs(dir_path, exist_ok=True)
\ No newline at end of file
+        os.makedirs(dir_path, exist_ok=True)
diff --git a/utils/file_utils.py b/utils/file_utils.py
index 2cdb7c7..71055a2 100644
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -267,7 +267,7 @@ def get_project_statistics(unique_id: str) -> Dict:
     }
 
     # Check each directory
-    directories = ["files", "processed", "dataset"]
+    directories = ["files", "processed", "datasets"]
 
     for dir_name in directories:
         dir_path = os.path.join(project_dir, dir_name)
@@ -293,4 +293,4 @@ def get_project_statistics(unique_id: str) -> Dict:
                 "files": 0
             }
 
-    return stats
\ No newline at end of file
+    return stats
diff --git a/utils/multi_project_manager.py b/utils/multi_project_manager.py
index fceace2..be29923 100644
--- a/utils/multi_project_manager.py
+++ b/utils/multi_project_manager.py
@@ -75,7 +75,7 @@ def generate_robot_directory_tree(robot_dir: str, robot_id: str, max_depth: int
         return lines
 
     # 从dataset目录开始构建树
-    dataset_dir = os.path.join(robot_dir, "dataset")
+    dataset_dir = os.path.join(robot_dir, "datasets")
     tree_lines = []
 
     if not os.path.exists(dataset_dir):
@@ -165,7 +165,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
     }
 
     try:
-        source_folder = project_path / "data" / source_project_id / "dataset" / folder_name
+        source_folder = project_path / "data" / source_project_id / "datasets" / folder_name
         result["source_path"] = str(source_folder)
 
         if not source_folder.exists():
@@ -231,7 +231,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
 
     readme_content += "## 数据集详情\n\n"
 
-    dataset_dir = robot_dir / "dataset"
+    dataset_dir = robot_dir / "datasets"
     if not dataset_dir.exists():
         readme_content += "No dataset files available.\n"
     else:
@@ -324,7 +324,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
 
     # 创建机器人目录结构(如果不存在)
     robot_dir = _get_robot_dir(project_path, bot_id)
-    dataset_dir = robot_dir / "dataset"
+    dataset_dir = robot_dir / "datasets"
     scripts_dir = robot_dir / "scripts"
     download_dir = robot_dir / "download"
 
diff --git a/utils/project_manager.py b/utils/project_manager.py
index 614d3aa..70a2944 100644
--- a/utils/project_manager.py
+++ b/utils/project_manager.py
@@ -62,7 +62,7 @@ def generate_directory_tree(project_dir: str, unique_id: str, max_depth: int = 3
         return lines
 
     # Start building tree from dataset directory
-    dataset_dir = os.path.join(project_dir, "dataset")
+    dataset_dir = os.path.join(project_dir, "datasets")
     tree_lines = []
 
     if not os.path.exists(dataset_dir):
@@ -128,7 +128,7 @@ This project contains processed documents and their associated embeddings for se
 
 """
 
-    dataset_dir = os.path.join(project_dir, "dataset")
+    dataset_dir = os.path.join(project_dir, "datasets")
     if not os.path.exists(dataset_dir):
         readme_content += "No dataset files available.\n"
     else:
@@ -217,7 +217,7 @@ def get_project_status(unique_id: str) -> Dict:
 
     # Collect document.txt files
     document_files = []
-    dataset_dir = os.path.join(project_dir, "dataset")
+    dataset_dir = os.path.join(project_dir, "datasets")
     if os.path.exists(dataset_dir):
         for root, dirs, files in os.walk(dataset_dir):
             for file in files:
@@ -321,7 +321,7 @@ def get_project_stats(unique_id: str) -> Dict:
 
     # Check embeddings files
     embedding_files = []
-    dataset_dir = os.path.join("projects", "data", unique_id, "dataset")
+    dataset_dir = os.path.join("projects", "data", unique_id, "datasets")
     if os.path.exists(dataset_dir):
         for root, dirs, files in os.walk(dataset_dir):
             for file in files:
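For reference, a minimal sketch (not part of the patch) of where per-group artifacts resolve on disk after the rename; the unique_id and group_name values below are illustrative assumptions, not values taken from the codebase.

import os

# Hypothetical IDs for illustration only; real values come from the project records.
unique_id = "demo-project"
group_name = "contracts"

# Mirrors the path construction used throughout the patch: the per-group output
# directory moves from projects/data/<id>/dataset/<group> to .../datasets/<group>.
project_dir = os.path.join("projects", "data", unique_id)
dataset_group_dir = os.path.join(project_dir, "datasets", group_name)

# Merged artifacts written per group by utils/data_merger.py.
for artifact in ("document.txt", "pagination.txt", "embedding.pkl"):
    print(os.path.join(dataset_group_dir, artifact))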