diff --git a/fastapi_app.py b/fastapi_app.py index 555f409..9d020ea 100644 --- a/fastapi_app.py +++ b/fastapi_app.py @@ -201,16 +201,16 @@ async def process_files_async_endpoint(request: QueueTaskRequest, authorization: 与 /api/v1/files/process 功能相同,但使用队列异步处理 Args: - request: QueueTaskRequest containing unique_id, files, system_prompt, mcp_settings, and queue options + request: QueueTaskRequest containing dataset_id, files, system_prompt, mcp_settings, and queue options authorization: Authorization header containing API key (Bearer ) Returns: QueueTaskResponse: Processing result with task ID for tracking """ try: - unique_id = request.unique_id - if not unique_id: - raise HTTPException(status_code=400, detail="unique_id is required") + dataset_id = request.dataset_id + if not dataset_id: + raise HTTPException(status_code=400, detail="dataset_id is required") # 估算处理时间(基于文件数量) estimated_time = 0 @@ -220,7 +220,7 @@ async def process_files_async_endpoint(request: QueueTaskRequest, authorization: # 提交异步任务 task_id = queue_manager.enqueue_multiple_files( - project_id=unique_id, + project_id=dataset_id, file_paths=[], original_filenames=[] ) @@ -230,13 +230,13 @@ async def process_files_async_endpoint(request: QueueTaskRequest, authorization: task_id = str(uuid.uuid4()) task_status_store.set_status( task_id=task_id, - unique_id=unique_id, + unique_id=dataset_id, status="pending" ) # 提交异步任务 task = process_files_async( - unique_id=unique_id, + unique_id=dataset_id, files=request.files, system_prompt=request.system_prompt, mcp_settings=request.mcp_settings, @@ -245,8 +245,8 @@ async def process_files_async_endpoint(request: QueueTaskRequest, authorization: return QueueTaskResponse( success=True, - message=f"文件处理任务已提交到队列,项目ID: {unique_id}", - unique_id=unique_id, + message=f"文件处理任务已提交到队列,项目ID: {dataset_id}", + unique_id=dataset_id, task_id=task_id, # 使用我们自己的task_id task_status="pending", estimated_processing_time=estimated_time @@ -313,12 +313,12 @@ async def delete_task(task_id: str): @app.get("/api/v1/tasks") -async def list_tasks(status: Optional[str] = None, unique_id: Optional[str] = None, limit: int = 100): +async def list_tasks(status: Optional[str] = None, dataset_id: Optional[str] = None, limit: int = 100): """列出任务,支持筛选""" try: - if status or unique_id: + if status or dataset_id: # 使用搜索功能 - tasks = task_status_store.search_tasks(status=status, unique_id=unique_id, limit=limit) + tasks = task_status_store.search_tasks(status=status, unique_id=dataset_id, limit=limit) else: # 获取所有任务 all_tasks = task_status_store.list_all() @@ -331,7 +331,7 @@ async def list_tasks(status: Optional[str] = None, unique_id: Optional[str] = No "tasks": tasks, "filters": { "status": status, - "unique_id": unique_id, + "dataset_id": dataset_id, "limit": limit } } @@ -376,16 +376,16 @@ async def cleanup_tasks(older_than_days: int = 7): raise HTTPException(status_code=500, detail=f"清理任务记录失败: {str(e)}") -@app.get("/api/v1/projects/{unique_id}/tasks") -async def get_project_tasks(unique_id: str): +@app.get("/api/v1/projects/{dataset_id}/tasks") +async def get_project_tasks(dataset_id: str): """获取指定项目的所有任务""" try: - tasks = task_status_store.get_by_unique_id(unique_id) + tasks = task_status_store.get_by_unique_id(dataset_id) return { "success": True, "message": "项目任务获取成功", - "unique_id": unique_id, + "dataset_id": dataset_id, "total_tasks": len(tasks), "tasks": tasks } @@ -395,16 +395,16 @@ async def get_project_tasks(unique_id: str): raise HTTPException(status_code=500, detail=f"获取项目任务失败: {str(e)}") 
-@app.post("/api/v1/files/{unique_id}/cleanup/async") -async def cleanup_project_async_endpoint(unique_id: str, remove_all: bool = False): +@app.post("/api/v1/files/{dataset_id}/cleanup/async") +async def cleanup_project_async_endpoint(dataset_id: str, remove_all: bool = False): """异步清理项目文件""" try: - task = cleanup_project_async(unique_id=unique_id, remove_all=remove_all) + task = cleanup_project_async(unique_id=dataset_id, remove_all=remove_all) return { "success": True, - "message": f"项目清理任务已提交到队列,项目ID: {unique_id}", - "unique_id": unique_id, + "message": f"项目清理任务已提交到队列,项目ID: {dataset_id}", + "dataset_id": dataset_id, "task_id": task.id, "action": "remove_all" if remove_all else "cleanup_logs" } @@ -419,11 +419,25 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] = Chat completions API similar to OpenAI, supports both streaming and non-streaming Args: - request: ChatRequest containing messages, model, dataset with unique_id, system_prompt, mcp_settings, and files + request: ChatRequest containing messages, model, dataset_ids (required list), required bot_id, system_prompt, mcp_settings, and files authorization: Authorization header containing API key (Bearer ) Returns: Union[ChatResponse, StreamingResponse]: Chat completion response or stream + + Notes: + - dataset_ids: 必需参数,必须是项目ID列表(单个项目也使用数组格式) + - bot_id: 必需参数,机器人ID,用于创建项目目录 + - 所有请求都会创建机器人项目目录:projects/robot/{bot_id}/ + - 支持多知识库合并,自动处理文件夹重名冲突 + + Required Parameters: + - dataset_ids: List[str] - 源知识库项目ID列表(单个项目也使用数组格式) + - bot_id: str - 目标机器人项目ID + + Example: + {"dataset_ids": ["project-123"], "bot_id": "my-bot-001"} + {"dataset_ids": ["project-123", "project-456"], "bot_id": "my-bot-002"} """ try: # 从Authorization header中提取API key @@ -435,23 +449,27 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] = else: api_key = authorization - # 获取unique_id - unique_id = request.unique_id - if not unique_id: - raise HTTPException(status_code=400, detail="unique_id is required") + # 获取dataset_ids(必需参数,必须是数组) + dataset_ids_list = request.dataset_ids + if not dataset_ids_list: + raise HTTPException(status_code=400, detail="dataset_ids is required and must be a non-empty list") - # 使用unique_id获取项目目录 - project_dir = os.path.join("projects", "data", unique_id) - if not os.path.exists(project_dir): - project_dir = "" + # 获取bot_id(必需参数) + bot_id = request.bot_id + if not bot_id: + raise HTTPException(status_code=400, detail="bot_id is required") + + # 创建机器人目录并合并数据 + from utils.multi_project_manager import create_robot_project + project_dir = create_robot_project(dataset_ids_list, bot_id) # 收集额外参数作为 generate_cfg - exclude_fields = {'messages', 'model', 'model_server', 'unique_id', 'language', 'tool_response', 'system_prompt', 'mcp_settings' ,'stream', 'robot_type'} + exclude_fields = {'messages', 'model', 'model_server', 'dataset_ids', 'language', 'tool_response', 'system_prompt', 'mcp_settings' ,'stream', 'robot_type', 'bot_id'} generate_cfg = {k: v for k, v in request.model_dump().items() if k not in exclude_fields} # 从全局管理器获取或创建助手实例(配置读取逻辑已在agent_manager内部处理) agent = await agent_manager.get_or_create_agent( - unique_id=unique_id, + bot_id=bot_id, project_dir=project_dir, model_name=request.model, api_key=api_key, @@ -636,27 +654,27 @@ async def get_cached_projects(): @app.post("/system/remove-project-cache") -async def remove_project_cache(unique_id: str): +async def remove_project_cache(dataset_id: str): """移除特定项目的缓存""" try: - removed_count = 
agent_manager.remove_cache_by_unique_id(unique_id) + removed_count = agent_manager.remove_cache_by_unique_id(dataset_id) if removed_count > 0: - return {"message": f"项目缓存移除成功: {unique_id}", "removed_count": removed_count} + return {"message": f"项目缓存移除成功: {dataset_id}", "removed_count": removed_count} else: - return {"message": f"未找到项目缓存: {unique_id}", "removed_count": 0} + return {"message": f"未找到项目缓存: {dataset_id}", "removed_count": 0} except Exception as e: raise HTTPException(status_code=500, detail=f"移除项目缓存失败: {str(e)}") -@app.get("/api/v1/files/{unique_id}/status") -async def get_files_processing_status(unique_id: str): +@app.get("/api/v1/files/{dataset_id}/status") +async def get_files_processing_status(dataset_id: str): """获取项目的文件处理状态""" try: # Load processed files log - processed_log = load_processed_files_log(unique_id) + processed_log = load_processed_files_log(dataset_id) # Get project directory info - project_dir = os.path.join("projects", "data", unique_id) + project_dir = os.path.join("projects", "data", dataset_id) project_exists = os.path.exists(project_dir) # Collect document.txt files @@ -668,27 +686,27 @@ async def get_files_processing_status(unique_id: str): document_files.append(os.path.join(root, file)) return { - "unique_id": unique_id, + "dataset_id": dataset_id, "project_exists": project_exists, "processed_files_count": len(processed_log), "processed_files": processed_log, "document_files_count": len(document_files), "document_files": document_files, - "log_file_exists": os.path.exists(os.path.join("projects", "data", unique_id, "processed_files.json")) + "log_file_exists": os.path.exists(os.path.join("projects", "data", dataset_id, "processed_files.json")) } except Exception as e: raise HTTPException(status_code=500, detail=f"获取文件处理状态失败: {str(e)}") -@app.post("/api/v1/files/{unique_id}/reset") -async def reset_files_processing(unique_id: str): +@app.post("/api/v1/files/{dataset_id}/reset") +async def reset_files_processing(dataset_id: str): """重置项目的文件处理状态,删除处理日志和所有文件""" try: - project_dir = os.path.join("projects", "data", unique_id) - log_file = os.path.join("projects", "data", unique_id, "processed_files.json") + project_dir = os.path.join("projects", "data", dataset_id) + log_file = os.path.join("projects", "data", dataset_id, "processed_files.json") # Load processed log to know what files to remove - processed_log = load_processed_files_log(unique_id) + processed_log = load_processed_files_log(dataset_id) removed_files = [] # Remove all processed files and their dataset directories @@ -702,12 +720,12 @@ async def reset_files_processing(unique_id: str): if 'key' in file_info: # Remove dataset directory by key key = file_info['key'] - if remove_dataset_directory_by_key(unique_id, key): + if remove_dataset_directory_by_key(dataset_id, key): removed_files.append(f"dataset/{key}") elif 'filename' in file_info: # Fallback to old filename-based structure filename_without_ext = os.path.splitext(file_info['filename'])[0] - dataset_dir = os.path.join("projects", "data", unique_id, "dataset", filename_without_ext) + dataset_dir = os.path.join("projects", "data", dataset_id, "dataset", filename_without_ext) if remove_file_or_directory(dataset_dir): removed_files.append(dataset_dir) @@ -736,7 +754,7 @@ async def reset_files_processing(unique_id: str): removed_files.append(readme_file) return { - "message": f"文件处理状态重置成功: {unique_id}", + "message": f"文件处理状态重置成功: {dataset_id}", "removed_files_count": len(removed_files), "removed_files": removed_files } diff --git 
a/mcp/mcp_settings_agent.json b/mcp/mcp_settings_agent.json index be9a640..79b7f15 100644 --- a/mcp/mcp_settings_agent.json +++ b/mcp/mcp_settings_agent.json @@ -5,7 +5,7 @@ "command": "python", "args": [ "./mcp/rag_retrieve_server.py", - "{unique_id}" + "{bot_id}" ] } } diff --git a/utils/__init__.py b/utils/__init__.py index ca3d743..51ea8ab 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -80,6 +80,15 @@ from .prompt_loader import ( load_system_prompt, ) +from .multi_project_manager import ( + create_robot_project, + get_robot_project_info, + cleanup_robot_project, + get_unique_folder_name, + copy_dataset_folder, + generate_robot_readme +) + __all__ = [ # file_utils 'download_file', @@ -148,4 +157,12 @@ __all__ = [ # prompt_loader 'load_system_prompt', + + # multi_project_manager + 'create_robot_project', + 'get_robot_project_info', + 'cleanup_robot_project', + 'get_unique_folder_name', + 'copy_dataset_folder', + 'generate_robot_readme', ] diff --git a/utils/api_models.py b/utils/api_models.py index 03e7176..5cf5e9b 100644 --- a/utils/api_models.py +++ b/utils/api_models.py @@ -43,7 +43,8 @@ class ChatRequest(BaseModel): messages: List[Message] model: str = "qwen3-next" model_server: str = "" - unique_id: Optional[str] = None + dataset_ids: List[str] + bot_id: str stream: Optional[bool] = False language: Optional[str] = "ja" tool_response: Optional[bool] = False diff --git a/utils/file_loaded_agent_manager.py b/utils/file_loaded_agent_manager.py index d029f18..1e12285 100644 --- a/utils/file_loaded_agent_manager.py +++ b/utils/file_loaded_agent_manager.py @@ -39,11 +39,11 @@ class FileLoadedAgentManager: self.creation_times: Dict[str, float] = {} # 创建时间记录 self.max_cached_agents = max_cached_agents - def _get_cache_key(self, unique_id: str, system_prompt: str = None, mcp_settings: List[Dict] = None) -> str: - """获取包含 unique_id、system_prompt 和 mcp_settings 的哈希值作为缓存键 + def _get_cache_key(self, bot_id: str, system_prompt: str = None, mcp_settings: List[Dict] = None) -> str: + """获取包含 bot_id、system_prompt 和 mcp_settings 的哈希值作为缓存键 Args: - unique_id: 项目的唯一标识符 + bot_id: 机器人项目ID system_prompt: 系统提示词 mcp_settings: MCP设置列表 @@ -52,7 +52,7 @@ class FileLoadedAgentManager: """ # 构建包含所有相关参数的字符串 cache_data = { - 'unique_id': unique_id, + 'bot_id': bot_id, 'system_prompt': system_prompt or '', 'mcp_settings': json.dumps(mcp_settings or [], sort_keys=True) } @@ -91,7 +91,7 @@ class FileLoadedAgentManager: logger.info(f"已清理 {removed_count} 个过期的助手实例缓存") async def get_or_create_agent(self, - unique_id: str, + bot_id: str, project_dir: str, model_name: str = "qwen3-next", api_key: Optional[str] = None, @@ -104,7 +104,7 @@ class FileLoadedAgentManager: """获取或创建文件预加载的助手实例 Args: - unique_id: 项目的唯一标识符 + bot_id: 项目的唯一标识符 project_dir: 项目目录路径,用于读取system_prompt.md和mcp_settings.json model_name: 模型名称 api_key: API 密钥 @@ -121,10 +121,10 @@ class FileLoadedAgentManager: import os # 实现参数优先级逻辑:传入参数 > 项目配置 > 默认配置 - final_system_prompt = load_system_prompt(project_dir, language, system_prompt, robot_type, unique_id) - final_mcp_settings = load_mcp_settings(project_dir, mcp_settings, unique_id, robot_type) + final_system_prompt = load_system_prompt(project_dir, language, system_prompt, robot_type, bot_id) + final_mcp_settings = load_mcp_settings(project_dir, mcp_settings, bot_id, robot_type) - cache_key = self._get_cache_key(unique_id, final_system_prompt, final_mcp_settings) + cache_key = self._get_cache_key(bot_id, final_system_prompt, final_mcp_settings) # 检查是否已存在该助手实例 if cache_key in self.agents: @@ -134,14 
+134,14 @@ class FileLoadedAgentManager: # 动态更新 LLM 配置和系统设置(如果参数有变化) update_agent_llm(agent, model_name, api_key, model_server, generate_cfg) - logger.info(f"复用现有的助手实例缓存: {cache_key} (unique_id: {unique_id}") + logger.info(f"复用现有的助手实例缓存: {cache_key} (bot_id: {bot_id}") return agent # 清理过期实例 self._cleanup_old_agents() # 创建新的助手实例,预加载文件 - logger.info(f"创建新的助手实例缓存: {cache_key}, unique_id: {unique_id}") + logger.info(f"创建新的助手实例缓存: {cache_key}, bot_id: {bot_id}") current_time = time.time() agent = init_modified_agent_service_with_files( @@ -155,7 +155,7 @@ class FileLoadedAgentManager: # 缓存实例 self.agents[cache_key] = agent - self.unique_ids[cache_key] = unique_id + self.unique_ids[cache_key] = bot_id self.access_times[cache_key] = current_time self.creation_times[cache_key] = current_time diff --git a/utils/multi_project_manager.py b/utils/multi_project_manager.py new file mode 100644 index 0000000..5321c2c --- /dev/null +++ b/utils/multi_project_manager.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +多项目管理器:处理多个知识库项目的合并 +""" + +import os +import shutil +import json +from pathlib import Path +from typing import List, Dict, Optional +from datetime import datetime + +from utils.file_utils import get_document_preview + + +def generate_robot_directory_tree(robot_dir: str, robot_id: str, max_depth: int = 3) -> str: + """ + 生成机器人项目的目录树结构 + + Args: + robot_dir: 机器人项目目录路径 + robot_id: 机器人ID + max_depth: 最大深度 + + Returns: + str: 目录树字符串 + """ + def _build_tree(path: str, prefix: str = "", is_last: bool = True, depth: int = 0) -> List[str]: + if depth > max_depth: + return [] + + lines = [] + try: + entries = sorted(os.listdir(path)) + # 分离目录和文件 + dirs = [e for e in entries if os.path.isdir(os.path.join(path, e)) and not e.startswith('.')] + files = [e for e in entries if os.path.isfile(os.path.join(path, e)) and not e.startswith('.')] + + entries = dirs + files + + for i, entry in enumerate(entries): + entry_path = os.path.join(path, entry) + is_dir = os.path.isdir(entry_path) + is_last_entry = i == len(entries) - 1 + + # 选择适当的树形符号 + if is_last_entry: + connector = "└── " + new_prefix = prefix + " " + else: + connector = "├── " + new_prefix = prefix + "│ " + + # 添加条目行 + line = prefix + connector + entry + if is_dir: + line += "/" + lines.append(line) + + # 递归添加子目录 + if is_dir and depth < max_depth: + sub_lines = _build_tree(entry_path, new_prefix, is_last_entry, depth + 1) + lines.extend(sub_lines) + + except PermissionError: + lines.append(prefix + "└── [Permission Denied]") + except Exception as e: + lines.append(prefix + "└── [Error: " + str(e) + "]") + + return lines + + # 从dataset目录开始构建树 + dataset_dir = os.path.join(robot_dir, "dataset") + tree_lines = [] + + if not os.path.exists(dataset_dir): + return "└── [No dataset directory found]" + + try: + entries = sorted(os.listdir(dataset_dir)) + dirs = [e for e in entries if os.path.isdir(os.path.join(dataset_dir, e)) and not e.startswith('.')] + files = [e for e in entries if os.path.isfile(os.path.join(dataset_dir, e)) and not e.startswith('.')] + + entries = dirs + files + + if not entries: + tree_lines.append("└── [Empty dataset directory]") + else: + for i, entry in enumerate(entries): + entry_path = os.path.join(dataset_dir, entry) + is_dir = os.path.isdir(entry_path) + is_last_entry = i == len(entries) - 1 + + if is_last_entry: + connector = "└── " + prefix = " " + else: + connector = "├── " + prefix = "│ " + + line = connector + entry + if is_dir: + line += "/" + tree_lines.append(line) + + # 递归添加子目录 + if is_dir: + sub_lines = 
_build_tree(entry_path, prefix, is_last_entry, 1) + tree_lines.extend(sub_lines) + + except Exception as e: + tree_lines.append(f"└── [Error generating tree: {str(e)}]") + + return "\n".join(tree_lines) + + +def get_unique_folder_name(target_dir: Path, original_name: str) -> str: + """ + 获取唯一的文件夹名称,如果存在重名则添加数字后缀 + + Args: + target_dir: 目标目录 + original_name: 原始文件夹名称 + + Returns: + str: 唯一的文件夹名称 + """ + if not (target_dir / original_name).exists(): + return original_name + + # 存在重名,添加数字后缀 + counter = 1 + while True: + new_name = f"{original_name}_{counter}" + if not (target_dir / new_name).exists(): + return new_name + counter += 1 + + +def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict: + """ + 复制单个项目的dataset文件夹到目标目录 + + Args: + source_project_id: 源项目ID + target_dataset_dir: 目标dataset目录 + folder_name: 要复制的文件夹名称 + + Returns: + Dict: 复制结果 + """ + result = { + "success": False, + "source_path": "", + "target_path": "", + "original_folder_name": folder_name, + "final_folder_name": folder_name, + "error": None + } + + try: + source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name + result["source_path"] = str(source_folder) + + if not source_folder.exists(): + result["error"] = f"Source folder does not exist: {source_folder}" + return result + + # 处理重名冲突 + unique_folder_name = get_unique_folder_name(target_dataset_dir, folder_name) + result["final_folder_name"] = unique_folder_name + + target_folder = target_dataset_dir / unique_folder_name + result["target_path"] = str(target_folder) + + # 复制文件夹 + shutil.copytree(source_folder, target_folder) + result["success"] = True + + print(f" Copied: {source_folder} -> {target_folder}") + + except Exception as e: + result["error"] = str(e) + print(f" Error copying {folder_name}: {str(e)}") + + return result + + +def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str: + """ + 生成机器人项目的README.md文件 + + Args: + robot_id: 机器人ID + dataset_ids: 源项目ID列表 + copy_results: 复制结果列表 + + Returns: + str: README.md文件路径 + """ + readme_path = Path("projects") / "robot" / robot_id / "README.md" + readme_path.parent.mkdir(parents=True, exist_ok=True) + + robot_dir = Path("projects") / "robot" / robot_id + + # 统计信息 + total_folders = len(copy_results) + successful_copies = sum(1 for r in copy_results if r["success"]) + failed_copies = total_folders - successful_copies + + # 按源项目分组 + project_groups = {} + for result in copy_results: + if result["success"]: + source_project = result["source_path"].split("/")[2] # projects/data/{project_id}/dataset/... 
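+            # Group successful copies by their source project for the per-project README sections;
+            # index 2 relies on POSIX "/" separators (source_path is stored via str(Path(...)) in copy_dataset_folder).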
+ if source_project not in project_groups: + project_groups[source_project] = [] + project_groups[source_project].append(result) + + readme_content = "\n## 目录结构\n\n" + # 生成实际目录树 + readme_content += "```\n" + readme_content += generate_robot_directory_tree(str(robot_dir), robot_id) + readme_content += "\n```\n\n" + + readme_content += "## 数据集详情\n\n" + + dataset_dir = robot_dir / "dataset" + if not dataset_dir.exists(): + readme_content += "No dataset files available.\n" + else: + # 获取所有文档目录 + doc_dirs = [] + try: + for item in sorted(os.listdir(dataset_dir)): + item_path = dataset_dir / item + if item_path.is_dir(): + doc_dirs.append(item) + except Exception as e: + print(f"Error listing dataset directories: {str(e)}") + + if not doc_dirs: + readme_content += "No document directories found.\n" + else: + # 按源项目分组显示文档 + for project_id, folders in project_groups.items(): + for folder in folders: + folder_name = folder["final_folder_name"] + doc_path = dataset_dir / folder_name + + readme_content += f"#### {folder_name}\n\n" + readme_content += f"**Files:**\n" + + # 检查文件存在性 + document_file = doc_path / "document.txt" + pagination_file = doc_path / "pagination.txt" + embeddings_file = doc_path / "embedding.pkl" + + readme_content += f"- `{folder_name}/document.txt`" + if document_file.exists(): + readme_content += " ✓" + readme_content += "\n" + + readme_content += f"- `{folder_name}/pagination.txt`" + if pagination_file.exists(): + readme_content += " ✓" + readme_content += "\n" + + readme_content += f"- `{folder_name}/embedding.pkl`" + if embeddings_file.exists(): + readme_content += " ✓" + readme_content += "\n\n" + + # 添加文档预览 + if document_file.exists(): + readme_content += f"**内容预览 (前10行):**\n\n```\n" + preview = get_document_preview(str(document_file), 10) + readme_content += preview + readme_content += "\n```\n\n" + else: + readme_content += f"**内容预览:** 不可用\n\n" + + # 显示重命名信息 + original_name = folder["original_folder_name"] + if original_name != folder_name: + readme_content += f"**原始名称:** `{original_name}` → `{folder_name}`\n\n" + + readme_content += "---\n\n" + + + # 写入README文件 + with open(readme_path, 'w', encoding='utf-8') as f: + f.write(readme_content) + + print(f"Generated README: {readme_path}") + return str(readme_path) + + +def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool: + """ + 检查是否需要重建机器人项目 + 1. 检查机器人项目是否存在 + 2. 检查是否有新增的dataset_id + 3. 
检查processing_log.json文件是否更新 + + Args: + dataset_ids: 源项目ID列表 + bot_id: 机器人ID + + Returns: + bool: 是否需要重建 + """ + robot_dir = Path("projects") / "robot" / bot_id + + # 如果机器人项目不存在,需要创建 + if not robot_dir.exists(): + print(f"Robot project does not exist, need to create: {bot_id}") + return True + + # 检查机器人项目的配置信息 + config_file = robot_dir / "robot_config.json" + if not config_file.exists(): + print(f"Robot config file not found, need to rebuild: {bot_id}") + return True + + # 读取配置信息 + try: + with open(config_file, 'r', encoding='utf-8') as f: + config = json.load(f) + cached_dataset_ids = set(config.get("dataset_ids", [])) + except Exception as e: + print(f"Error reading robot config: {e}, need to rebuild") + return True + + # 检查dataset_ids是否有变化 + current_dataset_ids = set(dataset_ids) + + # 如果有新增的dataset_id + new_ids = current_dataset_ids - cached_dataset_ids + if new_ids: + print(f"Found new dataset_ids: {new_ids}, need to rebuild") + return True + + # 如果有删除的dataset_id + removed_ids = cached_dataset_ids - current_dataset_ids + if removed_ids: + print(f"Removed dataset_ids: {removed_ids}, need to rebuild") + return True + + # 获取机器人项目的最后修改时间 + robot_mod_time = robot_dir.stat().st_mtime + + # 检查每个源项目的processing_log.json文件 + for source_project_id in dataset_ids: + log_file = Path("projects") / "data" / source_project_id / "processing_log.json" + + if not log_file.exists(): + print(f"Processing log file not found for project {source_project_id}, will rebuild") + return True + + log_mod_time = log_file.stat().st_mtime + + # 如果任何一个processing_log.json文件比机器人项目新,需要重建 + if log_mod_time > robot_mod_time: + print(f"Processing log updated for project {source_project_id}, need to rebuild") + return True + + print(f"Robot project {bot_id} is up to date, no rebuild needed") + return False + + +def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str: + """ + 创建机器人项目,合并多个源项目的dataset文件夹 + + Args: + dataset_ids: 源项目ID列表 + bot_id: 机器人ID + force_rebuild: 是否强制重建 + + Returns: + str: 机器人项目目录路径 + """ + print(f"Creating robot project: {bot_id} from sources: {dataset_ids}") + + # 检查是否需要重建 + if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id): + robot_dir = Path("projects") / "robot" / bot_id + print(f"Using existing robot project: {robot_dir}") + return str(robot_dir) + + # 创建机器人目录结构 + robot_dir = Path("projects") / "robot" / bot_id + dataset_dir = robot_dir / "dataset" + + # 清理已存在的目录(如果需要) + if robot_dir.exists(): + print(f"Robot directory already exists, cleaning up: {robot_dir}") + shutil.rmtree(robot_dir) + + robot_dir.mkdir(parents=True, exist_ok=True) + dataset_dir.mkdir(parents=True, exist_ok=True) + + copy_results = [] + + # 遍历每个源项目 + for source_project_id in dataset_ids: + print(f"\nProcessing source project: {source_project_id}") + + source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset" + + if not source_dataset_dir.exists(): + print(f" Warning: Dataset directory not found for project {source_project_id}") + continue + + # 获取所有子文件夹 + folders = [f for f in source_dataset_dir.iterdir() if f.is_dir()] + + if not folders: + print(f" Warning: No folders found in dataset directory for project {source_project_id}") + continue + + # 复制每个文件夹 + for folder in folders: + result = copy_dataset_folder(source_project_id, dataset_dir, folder.name) + copy_results.append(result) + + # 保存配置信息 + config_file = robot_dir / "robot_config.json" + config_data = { + "dataset_ids": dataset_ids, + "created_at": datetime.now().isoformat(), + 
"total_folders": len(copy_results), + "successful_copies": sum(1 for r in copy_results if r["success"]) + } + + with open(config_file, 'w', encoding='utf-8') as f: + json.dump(config_data, f, ensure_ascii=False, indent=2) + + # 生成README + readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results) + + # 统计信息 + successful_copies = sum(1 for r in copy_results if r["success"]) + print(f"\nRobot project creation completed:") + print(f" Robot directory: {robot_dir}") + print(f" Total folders processed: {len(copy_results)}") + print(f" Successful copies: {successful_copies}") + print(f" Config saved: {config_file}") + print(f" README generated: {readme_path}") + + return str(robot_dir) + + +def get_robot_project_info(bot_id: str) -> Dict: + """ + 获取机器人项目信息 + + Args: + bot_id: 机器人ID + + Returns: + Dict: 机器人项目信息 + """ + robot_dir = Path("projects") / "robot" / bot_id + + if not robot_dir.exists(): + return { + "exists": False, + "bot_id": bot_id, + "error": "Robot project does not exist" + } + + dataset_dir = robot_dir / "dataset" + readme_path = robot_dir / "README.md" + + # 统计文件夹数量 + folder_count = 0 + total_size = 0 + if dataset_dir.exists(): + for item in dataset_dir.iterdir(): + if item.is_dir(): + folder_count += 1 + # 计算文件夹大小 + for file_path in item.rglob('*'): + if file_path.is_file(): + total_size += file_path.stat().st_size + + return { + "exists": True, + "bot_id": bot_id, + "robot_dir": str(robot_dir), + "dataset_dir": str(dataset_dir), + "readme_exists": readme_path.exists(), + "folder_count": folder_count, + "total_size_bytes": total_size, + "total_size_mb": round(total_size / (1024 * 1024), 2) + } + + +def cleanup_robot_project(bot_id: str) -> bool: + """ + 清理机器人项目 + + Args: + bot_id: 机器人ID + + Returns: + bool: 清理是否成功 + """ + try: + robot_dir = Path("projects") / "robot" / bot_id + + if robot_dir.exists(): + shutil.rmtree(robot_dir) + print(f"Cleaned up robot project: {bot_id}") + return True + else: + print(f"Robot project does not exist: {bot_id}") + return True + + except Exception as e: + print(f"Error cleaning up robot project {bot_id}: {str(e)}") + return False + + +if __name__ == "__main__": + # 测试代码 + test_dataset_ids = ["test-project-1", "test-project-2"] + test_bot_id = "test-robot-001" + + robot_dir = create_robot_project(test_dataset_ids, test_bot_id) + print(f"Created robot project at: {robot_dir}") + + info = get_robot_project_info(test_bot_id) + print(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}") diff --git a/utils/prompt_loader.py b/utils/prompt_loader.py index a8040b2..9b2a062 100644 --- a/utils/prompt_loader.py +++ b/utils/prompt_loader.py @@ -7,7 +7,7 @@ import json from typing import List, Dict, Optional -def load_system_prompt(project_dir: str, language: str = None, system_prompt: str=None, robot_type: str = "agent", unique_id: str="") -> str: +def load_system_prompt(project_dir: str, language: str = None, system_prompt: str=None, robot_type: str = "agent", bot_id: str="") -> str: # 获取语言显示名称 language_display_map = { 'zh': '中文', @@ -19,7 +19,7 @@ def load_system_prompt(project_dir: str, language: str = None, system_prompt: st # 如果存在{language} 占位符,那么就直接使用 system_prompt if system_prompt and "{language}" in system_prompt: - return system_prompt.replace("{language}", language_display).replace('{unique_id}', unique_id) or "" + return system_prompt.replace("{language}", language_display).replace('{bot_id}', bot_id) or "" elif robot_type == "agent" or robot_type == "catalog_agent": """ 
优先使用项目目录的system_prompt_catalog_agent.md,没有才使用默认的system_prompt_default.md @@ -51,13 +51,13 @@ def load_system_prompt(project_dir: str, language: str = None, system_prompt: st readme = f.read().strip() system_prompt_default = system_prompt_default.replace("{readme}", str(readme)) - return system_prompt_default.replace("{language}", language_display).replace("{extra_prompt}", system_prompt or "").replace('{unique_id}', unique_id) or "" + return system_prompt_default.replace("{language}", language_display).replace("{extra_prompt}", system_prompt or "").replace('{bot_id}', bot_id) or "" else: - return system_prompt.replace("{language}", language_display).replace('{unique_id}', unique_id) or "" + return system_prompt.replace("{language}", language_display).replace('{bot_id}', bot_id) or "" -def replace_mcp_placeholders(mcp_settings: List[Dict], dataset_dir: str, unique_id: str) -> List[Dict]: +def replace_mcp_placeholders(mcp_settings: List[Dict], dataset_dir: str, bot_id: str) -> List[Dict]: """ 替换 MCP 配置中的占位符 """ @@ -70,21 +70,21 @@ def replace_mcp_placeholders(mcp_settings: List[Dict], dataset_dir: str, unique_ for key, value in obj.items(): if key == 'args' and isinstance(value, list): # 特别处理 args 列表 - obj[key] = [item.replace('{dataset_dir}', dataset_dir).replace('{unique_id}', unique_id) if isinstance(item, str) else item + obj[key] = [item.replace('{dataset_dir}', dataset_dir).replace('{bot_id}', bot_id) if isinstance(item, str) else item for item in value] elif isinstance(value, (dict, list)): obj[key] = replace_placeholders_in_obj(value) elif isinstance(value, str): - obj[key] = value.replace('{dataset_dir}', dataset_dir).replace('{unique_id}', unique_id) + obj[key] = value.replace('{dataset_dir}', dataset_dir).replace('{bot_id}', bot_id) elif isinstance(obj, list): return [replace_placeholders_in_obj(item) if isinstance(item, (dict, list)) else - item.replace('{dataset_dir}', dataset_dir).replace('{unique_id}', unique_id) if isinstance(item, str) else item + item.replace('{dataset_dir}', dataset_dir).replace('{bot_id}', bot_id) if isinstance(item, str) else item for item in obj] return obj return replace_placeholders_in_obj(mcp_settings) -def load_mcp_settings(project_dir: str, mcp_settings: list=None, unique_id: str="", robot_type: str = "agent") -> List[Dict]: +def load_mcp_settings(project_dir: str, mcp_settings: list=None, bot_id: str="", robot_type: str = "agent") -> List[Dict]: """ 始终读取默认MCP设置,然后与传入的mcp_settings合并,合并方式为合并[0].mcpServers对象 @@ -92,7 +92,7 @@ def load_mcp_settings(project_dir: str, mcp_settings: list=None, unique_id: str= Args: project_dir: 项目目录路径 mcp_settings: 可选的MCP设置,将与默认设置合并 - unique_id: 唯一标识符 + bot_id: 机器人项目ID robot_type: 机器人类型,取值 agent/catalog_agent Returns: @@ -156,6 +156,6 @@ def load_mcp_settings(project_dir: str, mcp_settings: list=None, unique_id: str= # 计算 dataset_dir 用于替换 MCP 配置中的占位符 dataset_dir = os.path.join(project_dir, "dataset") # 替换 MCP 配置中的 {dataset_dir} 占位符 - merged_settings = replace_mcp_placeholders(merged_settings, dataset_dir, unique_id) + merged_settings = replace_mcp_placeholders(merged_settings, dataset_dir, bot_id) return merged_settings
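
The chat endpoint now requires `dataset_ids` (always a list, even for a single project) and `bot_id`; each call merges the referenced knowledge bases into `projects/robot/{bot_id}/` before the agent is created. A minimal client sketch, assuming the route path, port, and API key shown here (the route decorator is outside this hunk, so `/v1/chat/completions` on `localhost:8000` is an assumption):

```python
import requests

payload = {
    "model": "qwen3-next",
    "bot_id": "my-bot-001",                         # target robot project (required)
    "dataset_ids": ["project-123", "project-456"],  # source knowledge-base projects (required, always a list)
    "stream": False,
    "messages": [{"role": "user", "content": "这些知识库里有哪些文档?"}],
}

# Assumed local base URL and route path; adjust to the actual deployment.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    headers={"Authorization": "Bearer sk-placeholder"},
    timeout=60,
)
print(resp.json())
```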
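On the task-listing side the query parameter is renamed to `dataset_id` as well (internally it is still forwarded to `task_status_store.search_tasks` as `unique_id`). A quick sketch against the same assumed base URL:

```python
import requests

resp = requests.get(
    "http://localhost:8000/api/v1/tasks",
    params={"dataset_id": "project-123", "status": "pending", "limit": 20},
    timeout=30,
)
data = resp.json()
print(data["filters"])  # echoes {"status": ..., "dataset_id": ..., "limit": ...}
print(data["tasks"])
```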
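When two source projects ship a dataset folder with the same name, `create_robot_project` does not overwrite; `get_unique_folder_name` appends a numeric suffix. A self-contained sketch of that behaviour:

```python
import tempfile
from pathlib import Path

from utils.multi_project_manager import get_unique_folder_name

with tempfile.TemporaryDirectory() as tmp:
    target = Path(tmp)
    (target / "manual").mkdir()    # simulate a folder already copied from another project
    (target / "manual_1").mkdir()  # ...and one earlier renamed copy

    print(get_unique_folder_name(target, "manual"))  # -> "manual_2"
    print(get_unique_folder_name(target, "faq"))     # -> "faq" (no conflict)
```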
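`mcp_settings_agent.json` now carries a `{bot_id}` placeholder instead of `{unique_id}`, and `replace_mcp_placeholders` resolves it together with `{dataset_dir}` before the MCP server is launched. A minimal sketch (the `rag_retrieve` server key and the paths are illustrative, not taken from the config file):

```python
from utils.prompt_loader import replace_mcp_placeholders

mcp_settings = [{
    "mcpServers": {
        "rag_retrieve": {
            "command": "python",
            "args": ["./mcp/rag_retrieve_server.py", "{bot_id}"],
        }
    }
}]

resolved = replace_mcp_placeholders(
    mcp_settings,
    dataset_dir="projects/robot/my-bot-001/dataset",  # hypothetical robot project dataset dir
    bot_id="my-bot-001",
)
print(resolved[0]["mcpServers"]["rag_retrieve"]["args"])
# -> ['./mcp/rag_retrieve_server.py', 'my-bot-001']
```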