add symlink
parent 61c6b69aa5
commit d3465d418e
@@ -98,9 +98,7 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
     datetime_str = format_datetime_by_language(language) if language else format_datetime_by_language('en')

     # If the {language} placeholder exists, use system_prompt directly
-    if robot_type == "deep_agent":
-        return None
-    elif robot_type == "general_agent" or robot_type == "catalog_agent":
+    if robot_type == "general_agent" or robot_type == "catalog_agent" or robot_type == "deep_agent":
         """
         Prefer README.md from the project directory; only fall back to the default system_prompt_{robot_type}.md if it is missing
         """
@@ -122,7 +120,15 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
         readme_path = os.path.join(project_dir, "README.md")
         readme = await config_cache.get_text_file(readme_path) or ""

-        prompt = system_prompt_default.format(readme=str(readme), extra_prompt=system_prompt or "",language=language_display, user_identifier=user_identifier, datetime=datetime_str)
+        agent_dir_path = f"~/.deepagents/{bot_id}"  # agent_dir_path maps to the project_dir directory; it is just the directory path shown to the AI
+        prompt = system_prompt_default.format(
+            readme=str(readme),
+            extra_prompt=system_prompt or "",
+            language=language_display,
+            user_identifier=user_identifier,
+            datetime=datetime_str,
+            agent_dir_path=agent_dir_path
+        )
     elif system_prompt:
         prompt = system_prompt.format(language=language_display, user_identifier=user_identifier, datetime=datetime_str)
     return prompt or ""
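For orientation, a minimal sketch of how the restructured format() call above fills the new {agent_dir_path} placeholder. The template and values below are made up for illustration; only the placeholder names mirror prompt/system_prompt_deep_agent.md and the keyword arguments in the diff.

```python
# Toy template with the same placeholder names as the new prompt file.
template = (
    "<env>\nWorking directory: {agent_dir_path}\n</env>\n"
    "{readme}\n{extra_prompt}\n"
    "Language: {language} | User: {user_identifier} | Time: {datetime}\n"
)

prompt = template.format(
    readme="# Robot Project\n(dataset summary goes here)",  # placeholder content
    extra_prompt="",
    language="English",
    user_identifier="user-42",               # hypothetical user id
    datetime="2024-01-01 00:00:00",          # hypothetical timestamp
    agent_dir_path="~/.deepagents/bot-123",  # hypothetical bot_id, as in the f-string above
)
print(prompt)
```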
prompt/system_prompt_deep_agent.md (new file, 62 lines)
@@ -0,0 +1,62 @@
+<env>
+Working directory: {agent_dir_path}
+</env>
+
+### Current Working Directory
+
+The filesystem backend is currently operating in: `{agent_dir_path}`
+
+### File System and Paths
+
+**IMPORTANT - Path Handling:**
+- All file paths must be absolute paths (e.g., `{agent_dir_path}/file.txt`)
+- Use the working directory from <env> to construct absolute paths
+- Example: To create a file in your working directory, use `{agent_dir_path}/research_project/file.md`
+- Never use relative paths - always construct full absolute paths
+
+### Skills Directory
+
+Your skills are stored at: `{agent_dir_path}/skills/`
+Skills may contain scripts or supporting files. When executing skill scripts with bash, use the real filesystem path:
+Example: `bash python {agent_dir_path}/skills/web-research/script.py`
+
+### Human-in-the-Loop Tool Approval
+
+Some tool calls require user approval before execution. When a tool call is rejected by the user:
+1. Accept their decision immediately - do NOT retry the same command
+2. Explain that you understand they rejected the action
+3. Suggest an alternative approach or ask for clarification
+4. Never attempt the exact same rejected command again
+
+Respect the user's decisions and work with them collaboratively.
+
+### Web Search Tool Usage
+
+When you use the web_search tool:
+1. The tool will return search results with titles, URLs, and content excerpts
+2. You MUST read and process these results, then respond naturally to the user
+3. NEVER show raw JSON or tool results directly to the user
+4. Synthesize the information from multiple sources into a coherent answer
+5. Cite your sources by mentioning page titles or URLs when relevant
+6. If the search doesn't find what you need, explain what you found and ask clarifying questions
+
+The user only sees your text responses - not tool results. Always provide a complete, natural language answer after using web_search.
+
+### Todo List Management
+
+When using the write_todos tool:
+1. Keep the todo list MINIMAL - aim for 3-6 items maximum
+2. Only create todos for complex, multi-step tasks that truly need tracking
+3. Break down work into clear, actionable items without over-fragmenting
+4. For simple tasks (1-2 steps), just do them directly without creating todos
+5. When first creating a todo list for a task, ALWAYS ask the user if the plan looks good before starting work
+   - Create the todos, let them render, then ask: "Does this plan look good?" or similar
+   - Wait for the user's response before marking the first todo as in_progress
+   - If they want changes, adjust the plan accordingly
+6. Update todo status promptly as you complete each item
+
+The todo list is a planning tool - use it judiciously to avoid overwhelming the user with excessive task tracking.
+
+## System Information
+- **Current User**: {user_identifier}
+- **Current Time**: {datetime}
@@ -19,7 +19,6 @@ from .config import huey
 from utils.file_utils import (
     extract_zip_file,
     get_file_hash,
-    is_file_already_processed,
     load_processed_files_log,
     save_processed_files_log,
     get_document_preview
@@ -9,7 +9,6 @@ from .file_utils import (
     remove_file_or_directory,
     extract_zip_file,
     get_document_preview,
-    is_file_already_processed,
     load_processed_files_log,
     save_processed_files_log
 )
@@ -44,11 +43,6 @@ from .agent_pool import (
     release_agent_to_pool
 )

-from .organize_dataset_files import (
-    is_file_already_processed,
-    organize_single_project_files,
-    organize_dataset_files
-)

 from .api_models import (
     Message,
@@ -77,8 +71,6 @@ from .api_models import (

 from .multi_project_manager import (
     create_robot_project,
-    get_robot_project_info,
-    cleanup_robot_project,
     get_unique_folder_name,
     copy_dataset_folder,
     generate_robot_readme
@@ -96,7 +88,6 @@ __all__ = [
     'remove_file_or_directory',
     'extract_zip_file',
     'get_document_preview',
-    'is_file_already_processed',
     'load_processed_files_log',
     'save_processed_files_log',

@@ -122,10 +113,6 @@ __all__ = [
     'get_agent_from_pool',
     'release_agent_to_pool',

-    # organize_dataset_files
-    'is_file_already_processed',
-    'organize_single_project_files',
-    'organize_dataset_files',

     # api_models
     'Message',
@@ -152,8 +139,6 @@ __all__ = [

     # multi_project_manager
     'create_robot_project',
-    'get_robot_project_info',
-    'cleanup_robot_project',
     'get_unique_folder_name',
     'copy_dataset_folder',
     'generate_robot_readme',
@@ -373,7 +373,8 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo

     try:
         from utils.multi_project_manager import create_robot_project
-        return create_robot_project(dataset_ids, bot_id)
+        from pathlib import Path
+        return create_robot_project(dataset_ids, bot_id, project_path=Path("~", ".deepagents"))
     except Exception as e:
         logger.error(f"Error creating project directory: {e}")
         return None
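A brief aside on the pathlib call used above, since it is easy to misread: Path("~", ".deepagents") builds the literal relative path ~/.deepagents, and pathlib only resolves ~ to the home directory when expanduser() is called. A standalone sketch, not taken from this codebase:

```python
from pathlib import Path

p = Path("~", ".deepagents")
print(p)               # ~/.deepagents  (literal tilde, still a relative path)
print(p.expanduser())  # e.g. /home/alice/.deepagents on Linux
```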
@@ -91,18 +91,6 @@ def get_document_preview(document_path: str, max_lines: int = 10) -> str:
         return f"Error reading document: {str(e)}"


-def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-    if not target_file.exists():
-        return False
-
-    # Check if pagination and embeddings files exist and are not empty
-    if pagination_file.exists() and embeddings_file.exists():
-        # Check file sizes to ensure they're not empty
-        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-            return True
-
-    return False


 def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
@@ -141,7 +141,7 @@ def get_unique_folder_name(target_dir: Path, original_name: str) -> str:
         counter += 1


-def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict:
+def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str, project_path: Path) -> Dict:
     """
     Copy a single project's dataset folder to the target directory

@@ -149,6 +149,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
         source_project_id: Source project ID
         target_dataset_dir: Target dataset directory
         folder_name: Name of the folder to copy
+        project_path: Project path

     Returns:
         Dict: Copy result
@@ -163,7 +164,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
     }

     try:
-        source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name
+        source_folder = project_path / "data" / source_project_id / "dataset" / folder_name
         result["source_path"] = str(source_folder)

         if not source_folder.exists():
@@ -190,7 +191,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
         return result


-def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str:
+def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict], project_path: Path) -> str:
     """
     Generate the robot project's README.md file

@@ -202,10 +203,10 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
     Returns:
         str: Path to the README.md file
     """
-    readme_path = Path("projects") / "robot" / robot_id / "README.md"
+    readme_path = project_path / "robot" / robot_id / "README.md"
     readme_path.parent.mkdir(parents=True, exist_ok=True)

-    robot_dir = Path("projects") / "robot" / robot_id
+    robot_dir = project_path / "robot" / robot_id

     # Statistics
     total_folders = len(copy_results)
@@ -300,7 +301,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
     return str(readme_path)


-def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
+def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str, project_path: Path) -> bool:
     """
     Check whether the robot project needs to be rebuilt
     1. Check whether the robot project exists
@@ -310,11 +311,12 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
     Args:
         dataset_ids: List of source project IDs
         bot_id: Robot ID
+        project_path: Project path

     Returns:
         bool: Whether a rebuild is needed
     """
-    robot_dir = Path("projects") / "robot" / bot_id
+    robot_dir = project_path / "robot" / bot_id

     # If the robot project does not exist, it needs to be created
     if not robot_dir.exists():
@@ -356,7 +358,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:

     # Check each source project's processing_log.json file
     for source_project_id in dataset_ids:
-        log_file = Path("projects") / "data" / source_project_id / "processing_log.json"
+        log_file = project_path / "data" / source_project_id / "processing_log.json"

         if not log_file.exists():
             logger.info(f"Processing log file not found for project {source_project_id}, will rebuild")
@@ -373,7 +375,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
     return False


-def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str:
+def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False, project_path: Path = Path("projects")) -> str:
     """
     Create a robot project by merging dataset folders from multiple source projects

@@ -386,15 +388,15 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
         str: Path to the robot project directory
     """
     logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}")

     # Check whether a rebuild is needed
-    if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id):
-        robot_dir = Path("projects") / "robot" / bot_id
+    if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id, project_path):
+        robot_dir = project_path / "robot" / bot_id
         logger.info(f"Using existing robot project: {robot_dir}")
         return str(robot_dir)

     # Create the robot directory structure
-    robot_dir = Path("projects") / "robot" / bot_id
+    robot_dir = project_path / "robot" / bot_id
     dataset_dir = robot_dir / "dataset"

     # Clean up the existing directory (if needed)
@@ -411,7 +413,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
     for source_project_id in dataset_ids:
         logger.info(f"\nProcessing source project: {source_project_id}")

-        source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset"
+        source_dataset_dir = project_path / "data" / source_project_id / "dataset"

         if not source_dataset_dir.exists():
             logger.warning(f" Warning: Dataset directory not found for project {source_project_id}")
@@ -426,7 +428,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo

         # Copy each folder
         for folder in folders:
-            result = copy_dataset_folder(source_project_id, dataset_dir, folder.name)
+            result = copy_dataset_folder(source_project_id, dataset_dir, folder.name, project_path)
             copy_results.append(result)

     # Save configuration info
@@ -442,7 +444,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
         json.dump(config_data, f, ensure_ascii=False, indent=2)

     # Generate the README
-    readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results)
+    readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results, project_path)

     # Statistics
     successful_copies = sum(1 for r in copy_results if r["success"])
@@ -456,78 +458,6 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
     return str(robot_dir)


-def get_robot_project_info(bot_id: str) -> Dict:
-    """
-    Get robot project info
-
-    Args:
-        bot_id: Robot ID
-
-    Returns:
-        Dict: Robot project info
-    """
-    robot_dir = Path("projects") / "robot" / bot_id
-
-    if not robot_dir.exists():
-        return {
-            "exists": False,
-            "bot_id": bot_id,
-            "error": "Robot project does not exist"
-        }
-
-    dataset_dir = robot_dir / "dataset"
-    readme_path = robot_dir / "README.md"
-
-    # Count the folders
-    folder_count = 0
-    total_size = 0
-    if dataset_dir.exists():
-        for item in dataset_dir.iterdir():
-            if item.is_dir():
-                folder_count += 1
-                # Calculate folder size
-                for file_path in item.rglob('*'):
-                    if file_path.is_file():
-                        total_size += file_path.stat().st_size
-
-    return {
-        "exists": True,
-        "bot_id": bot_id,
-        "robot_dir": str(robot_dir),
-        "dataset_dir": str(dataset_dir),
-        "readme_exists": readme_path.exists(),
-        "folder_count": folder_count,
-        "total_size_bytes": total_size,
-        "total_size_mb": round(total_size / (1024 * 1024), 2)
-    }
-
-
-def cleanup_robot_project(bot_id: str) -> bool:
-    """
-    Clean up the robot project
-
-    Args:
-        bot_id: Robot ID
-
-    Returns:
-        bool: Whether cleanup succeeded
-    """
-    try:
-        robot_dir = Path("projects") / "robot" / bot_id
-
-        if robot_dir.exists():
-            shutil.rmtree(robot_dir)
-            logger.info(f"Cleaned up robot project: {bot_id}")
-            return True
-        else:
-            logger.info(f"Robot project does not exist: {bot_id}")
-            return True
-
-    except Exception as e:
-        logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}")
-        return False
-
-
 if __name__ == "__main__":
     # Test code
     test_dataset_ids = ["test-project-1", "test-project-2"]
@@ -536,5 +466,3 @@ if __name__ == "__main__":
     robot_dir = create_robot_project(test_dataset_ids, test_bot_id)
     logger.info(f"Created robot project at: {robot_dir}")

-    info = get_robot_project_info(test_bot_id)
-    logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
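A minimal usage sketch of the new project_path parameter threaded through these helpers, relying only on the signatures and imports shown in the hunks above; the dataset IDs and bot ID are hypothetical:

```python
from pathlib import Path

from utils.multi_project_manager import create_robot_project

# Hypothetical identifiers for illustration only.
dataset_ids = ["dataset-aaa", "dataset-bbb"]
bot_id = "bot-123"

# Previous behavior: everything rooted under ./projects (the default project_path).
robot_dir = create_robot_project(dataset_ids, bot_id)

# New behavior: root the robot project under another base directory.
robot_dir = create_robot_project(
    dataset_ids,
    bot_id,
    force_rebuild=False,
    project_path=Path("projects"),  # swap in any base Path here
)
print(robot_dir)
```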
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-import os
-import shutil
-import logging
-from pathlib import Path
-
-# Configure logging
-logger = logging.getLogger('app')
-
-def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-    if not target_file.exists():
-        return False
-
-    # Check if pagination and embeddings files exist and are not empty
-    if pagination_file.exists() and embeddings_file.exists():
-        # Check file sizes to ensure they're not empty
-        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-            return True
-
-    return False
-
-def organize_single_project_files(unique_id: str, skip_processed=True):
-    """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
-
-    project_dir = Path("projects") / "data" / unique_id
-
-    if not project_dir.exists():
-        logger.error(f"Project directory not found: {project_dir}")
-        return
-
-    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
-
-    files_dir = project_dir / "files"
-    dataset_dir = project_dir / "dataset"
-
-    # Check if files directory exists and has files
-    if not files_dir.exists():
-        logger.info(f" No files directory found, skipping...")
-        return
-
-    files = list(files_dir.glob("*"))
-    if not files:
-        logger.info(f" Files directory is empty, skipping...")
-        return
-
-    # Create dataset directory if it doesn't exist
-    dataset_dir.mkdir(exist_ok=True)
-
-    # Copy each file to its own directory
-    for file_path in files:
-        if file_path.is_file():
-            # Get filename without extension as directory name
-            file_name_without_ext = file_path.stem
-            target_dir = dataset_dir / file_name_without_ext
-            target_file = target_dir / "document.txt"
-            pagination_file = target_dir / "pagination.txt"
-            embeddings_file = target_dir / "embedding.pkl"
-
-            # Check if file is already processed
-            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
-                logger.info(f" Skipping already processed file: {file_path.name}")
-                continue
-
-            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
-
-            # Create target directory
-            target_dir.mkdir(exist_ok=True)
-
-            # Copy and rename file
-            shutil.copy2(str(file_path), str(target_file))
-
-    print(f" Files remain in original location (copied to dataset structure)")
-
-    # Process each document.txt file: split pages and generate embeddings
-    if not skip_processed:
-        import sys
-        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
-
-        from embedding import split_document_by_pages, embed_document
-
-        for file_path in files:
-            if file_path.is_file():
-                file_name_without_ext = file_path.stem
-                target_dir = dataset_dir / file_name_without_ext
-                document_file = target_dir / "document.txt"
-                pagination_file = target_dir / "pagination.txt"
-                embeddings_file = target_dir / "embedding.pkl"
-
-                # Skip if already processed
-                if is_file_already_processed(document_file, pagination_file, embeddings_file):
-                    print(f" Skipping document processing for already processed file: {file_path.name}")
-                    continue
-
-                # Split document by pages
-                print(f" Splitting pages for {document_file.name}")
-                try:
-                    pages = split_document_by_pages(str(document_file), str(pagination_file))
-                    print(f" Generated {len(pages)} pages")
-                except Exception as e:
-                    print(f" Failed to split pages: {e}")
-                    continue
-
-                # Generate embeddings
-                print(f" Generating embeddings for {document_file.name}")
-                try:
-                    # Use paragraph chunking strategy with default settings
-                    embedding_data = embed_document(
-                        str(document_file),
-                        str(embeddings_file),
-                        chunking_strategy='paragraph'
-                    )
-
-                    if embedding_data:
-                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
-                    else:
-                        print(f" Failed to generate embeddings")
-                except Exception as e:
-                    print(f" Failed to generate embeddings: {e}")
-
-        print(f" Document processing completed for project {unique_id}")
-    else:
-        print(f" Skipping document processing (skip_processed=True)")
-
-
-def organize_dataset_files():
-    """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
-
-    projects_dir = Path("projects") / "data"
-
-    if not projects_dir.exists():
-        print("Projects directory not found")
-        return
-
-    # Get all project directories (exclude cache and other non-project dirs)
-    project_dirs = [d for d in projects_dir.iterdir()
-                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]
-
-    for project_dir in project_dirs:
-        print(f"\nProcessing project: {project_dir.name}")
-
-        files_dir = project_dir / "files"
-        dataset_dir = project_dir / "dataset"
-
-        # Check if files directory exists and has files
-        if not files_dir.exists():
-            logger.info(f" No files directory found, skipping...")
-            continue
-
-        files = list(files_dir.glob("*"))
-        if not files:
-            logger.info(f" Files directory is empty, skipping...")
-            continue
-
-        # Create dataset directory if it doesn't exist
-        dataset_dir.mkdir(exist_ok=True)
-
-        # Move each file to its own directory
-        for file_path in files:
-            if file_path.is_file():
-                # Get filename without extension as directory name
-                file_name_without_ext = file_path.stem
-                target_dir = dataset_dir / file_name_without_ext
-                target_file = target_dir / "document.txt"
-
-                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
-
-                # Create target directory
-                target_dir.mkdir(exist_ok=True)
-
-                # Copy and rename file
-                shutil.copy2(str(file_path), str(target_file))
-
-        print(f" Files remain in original location (copied to dataset structure)")
-
-    print("\nFile organization complete!")
-
-if __name__ == "__main__":
-    organize_dataset_files()
@@ -54,6 +54,17 @@ def setup_deepagents_symlink():
             logger.info(f"Removed existing symlink pointing to {target}")

     # Create the symbolic link
+    # Check again before creating to handle race conditions
+    if deepagents_dir.is_symlink() or deepagents_dir.exists():
+        logger.warning(f"Path {deepagents_dir} exists, attempting to remove before symlink")
+        if deepagents_dir.is_symlink():
+            deepagents_dir.unlink()
+        elif deepagents_dir.is_dir():
+            import shutil
+            shutil.rmtree(str(deepagents_dir))
+        else:
+            deepagents_dir.unlink()
+
     os.symlink(robot_dir, deepagents_dir, target_is_directory=True)
     logger.info(f"Created symbolic link: {deepagents_dir} -> {robot_dir}")
     return True
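The added guard checks both is_symlink() and exists() because Path.exists() follows the link: a dangling symlink reports exists() == False while is_symlink() still reports True, so checking exists() alone would miss it. A standalone sketch of that distinction (POSIX only, throwaway temp paths, not from this repo):

```python
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    link = Path(tmp) / "deepagents"
    os.symlink(Path(tmp) / "missing-target", link)  # link to a path that does not exist

    print(link.exists())      # False - exists() follows the dangling link
    print(link.is_symlink())  # True  - the link itself is present
    link.unlink()             # removing the link, as the hunk does
```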