add symlink

parent 61c6b69aa5
commit d3465d418e
@@ -98,9 +98,7 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
    datetime_str = format_datetime_by_language(language) if language else format_datetime_by_language('en')

    # If a {language} placeholder exists, use system_prompt directly
-   if robot_type == "deep_agent":
-       return None
-   elif robot_type == "general_agent" or robot_type == "catalog_agent":
+   if robot_type == "general_agent" or robot_type == "catalog_agent" or robot_type == "deep_agent":
        """
        Prefer the README.md in the project directory; fall back to the default system_prompt_{robot_type}.md only when it is missing
        """
@@ -122,7 +120,15 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
        readme_path = os.path.join(project_dir, "README.md")
        readme = await config_cache.get_text_file(readme_path) or ""

-       prompt = system_prompt_default.format(readme=str(readme), extra_prompt=system_prompt or "", language=language_display, user_identifier=user_identifier, datetime=datetime_str)
+       agent_dir_path = f"~/.deepagents/{bot_id}"  # agent_dir_path maps to the project_dir directory; it is only the directory path shown to the AI
+       prompt = system_prompt_default.format(
+           readme=str(readme),
+           extra_prompt=system_prompt or "",
+           language=language_display,
+           user_identifier=user_identifier,
+           datetime=datetime_str,
+           agent_dir_path=agent_dir_path
+       )
    elif system_prompt:
        prompt = system_prompt.format(language=language_display, user_identifier=user_identifier, datetime=datetime_str)
    return prompt or ""
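For context, a minimal sketch (variable values are assumptions, not taken from the repo) of how the new `agent_dir_path` keyword flows into the template via `str.format`, alongside the existing placeholders:

```python
# Hypothetical illustration of the substitution above; the real template is
# prompt/system_prompt_deep_agent.md and the real values come from
# load_system_prompt_async.
template = "Working directory: {agent_dir_path}\nUser: {user_identifier}\nTime: {datetime}"

bot_id = "demo-bot"                          # assumed example value
agent_dir_path = f"~/.deepagents/{bot_id}"   # virtual path shown to the agent

prompt = template.format(
    agent_dir_path=agent_dir_path,
    user_identifier="alice",                 # assumed example value
    datetime="2024-01-01 12:00",             # assumed example value
)
print(prompt)
```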
prompt/system_prompt_deep_agent.md (new file, 62 lines)
@@ -0,0 +1,62 @@
<env>
Working directory: {agent_dir_path}
</env>

### Current Working Directory

The filesystem backend is currently operating in: `{agent_dir_path}`

### File System and Paths

**IMPORTANT - Path Handling:**
- All file paths must be absolute paths (e.g., `{agent_dir_path}/file.txt`)
- Use the working directory from <env> to construct absolute paths
- Example: To create a file in your working directory, use `{agent_dir_path}/research_project/file.md`
- Never use relative paths - always construct full absolute paths

### Skills Directory

Your skills are stored at: `{agent_dir_path}/skills/`
Skills may contain scripts or supporting files. When executing skill scripts with bash, use the real filesystem path:
Example: `bash python {agent_dir_path}/skills/web-research/script.py`

### Human-in-the-Loop Tool Approval

Some tool calls require user approval before execution. When a tool call is rejected by the user:
1. Accept their decision immediately - do NOT retry the same command
2. Explain that you understand they rejected the action
3. Suggest an alternative approach or ask for clarification
4. Never attempt the exact same rejected command again

Respect the user's decisions and work with them collaboratively.

### Web Search Tool Usage

When you use the web_search tool:
1. The tool will return search results with titles, URLs, and content excerpts
2. You MUST read and process these results, then respond naturally to the user
3. NEVER show raw JSON or tool results directly to the user
4. Synthesize the information from multiple sources into a coherent answer
5. Cite your sources by mentioning page titles or URLs when relevant
6. If the search doesn't find what you need, explain what you found and ask clarifying questions

The user only sees your text responses - not tool results. Always provide a complete, natural language answer after using web_search.

### Todo List Management

When using the write_todos tool:
1. Keep the todo list MINIMAL - aim for 3-6 items maximum
2. Only create todos for complex, multi-step tasks that truly need tracking
3. Break down work into clear, actionable items without over-fragmenting
4. For simple tasks (1-2 steps), just do them directly without creating todos
5. When first creating a todo list for a task, ALWAYS ask the user if the plan looks good before starting work
   - Create the todos, let them render, then ask: "Does this plan look good?" or similar
   - Wait for the user's response before marking the first todo as in_progress
   - If they want changes, adjust the plan accordingly
6. Update todo status promptly as you complete each item

The todo list is a planning tool - use it judiciously to avoid overwhelming the user with excessive task tracking.

## System Information
- **Current User**: {user_identifier}
- **Current Time**: {datetime}
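The `{agent_dir_path}` shown to the agent is the `~/.deepagents/...` virtual path; if the symlink set up elsewhere in this commit maps it onto the real robot project directory, it can be resolved back as in this minimal sketch (paths and values are assumptions, not the project's actual code):

```python
from pathlib import Path

bot_id = "demo-bot"                          # assumed example value
agent_dir_path = f"~/.deepagents/{bot_id}"   # virtual path the agent sees

# expanduser() turns "~" into the real home directory; resolve() follows any
# symlink so the result is whatever real directory the link points to.
real_dir = Path(agent_dir_path).expanduser().resolve()
print(real_dir)
```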
@@ -19,7 +19,6 @@ from .config import huey
from utils.file_utils import (
    extract_zip_file,
    get_file_hash,
    is_file_already_processed,
    load_processed_files_log,
    save_processed_files_log,
    get_document_preview
@@ -9,7 +9,6 @@ from .file_utils import (
    remove_file_or_directory,
    extract_zip_file,
    get_document_preview,
    is_file_already_processed,
    load_processed_files_log,
    save_processed_files_log
)
@@ -44,11 +43,6 @@ from .agent_pool import (
    release_agent_to_pool
)

- from .organize_dataset_files import (
-     is_file_already_processed,
-     organize_single_project_files,
-     organize_dataset_files
- )

from .api_models import (
    Message,
@@ -77,8 +71,6 @@ from .api_models import (

from .multi_project_manager import (
    create_robot_project,
    get_robot_project_info,
    cleanup_robot_project,
    get_unique_folder_name,
    copy_dataset_folder,
    generate_robot_readme
@@ -96,7 +88,6 @@ __all__ = [
    'remove_file_or_directory',
    'extract_zip_file',
    'get_document_preview',
    'is_file_already_processed',
    'load_processed_files_log',
    'save_processed_files_log',
@@ -122,10 +113,6 @@ __all__ = [
    'get_agent_from_pool',
    'release_agent_to_pool',

-     # organize_dataset_files
-     'is_file_already_processed',
-     'organize_single_project_files',
-     'organize_dataset_files',

    # api_models
    'Message',
@@ -152,8 +139,6 @@ __all__ = [

    # multi_project_manager
    'create_robot_project',
    'get_robot_project_info',
    'cleanup_robot_project',
    'get_unique_folder_name',
    'copy_dataset_folder',
    'generate_robot_readme',
@@ -373,7 +373,8 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo

    try:
        from utils.multi_project_manager import create_robot_project
-       return create_robot_project(dataset_ids, bot_id)
+       from pathlib import Path
+       return create_robot_project(dataset_ids, bot_id, Path("~", ".deepagents"))
    except Exception as e:
        logger.error(f"Error creating project directory: {e}")
        return None
@@ -91,18 +91,6 @@ def get_document_preview(document_path: str, max_lines: int = 10) -> str:
        return f"Error reading document: {str(e)}"


- def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
-     """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
-     if not target_file.exists():
-         return False
-
-     # Check if pagination and embeddings files exist and are not empty
-     if pagination_file.exists() and embeddings_file.exists():
-         # Check file sizes to ensure they're not empty
-         if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
-             return True
-
-     return False


def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
@@ -141,7 +141,7 @@ def get_unique_folder_name(target_dir: Path, original_name: str) -> str:
        counter += 1


- def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str) -> Dict:
+ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder_name: str, project_path: Path) -> Dict:
    """
    Copy a single project's dataset folder to the target directory

@@ -149,6 +149,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
        source_project_id: source project ID
        target_dataset_dir: target dataset directory
        folder_name: name of the folder to copy
+       project_path: project path

    Returns:
        Dict: copy result
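A hypothetical call with the widened signature, mirroring the call site changed later in this diff (the IDs, folder name, and project root are placeholder values):

```python
from pathlib import Path

# Placeholder invocation; "projects" is the default project root used by
# create_robot_project elsewhere in this module.
result = copy_dataset_folder(
    source_project_id="demo-project-1",
    target_dataset_dir=Path("projects") / "robot" / "demo-bot" / "dataset",
    folder_name="report",
    project_path=Path("projects"),
)
print(result["success"], result["source_path"])
```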
@@ -163,7 +164,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
        }

    try:
-       source_folder = Path("projects") / "data" / source_project_id / "dataset" / folder_name
+       source_folder = project_path / "data" / source_project_id / "dataset" / folder_name
        result["source_path"] = str(source_folder)

        if not source_folder.exists():
@@ -190,7 +191,7 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
    return result


- def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict]) -> str:
+ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: List[Dict], project_path: Path) -> str:
    """
    Generate the README.md file for the robot project

@@ -202,10 +203,10 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
    Returns:
        str: path of the README.md file
    """
-   readme_path = Path("projects") / "robot" / robot_id / "README.md"
+   readme_path = project_path / "robot" / robot_id / "README.md"
    readme_path.parent.mkdir(parents=True, exist_ok=True)

-   robot_dir = Path("projects") / "robot" / robot_id
+   robot_dir = project_path / "robot" / robot_id

    # Statistics
    total_folders = len(copy_results)
@@ -300,7 +301,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
    return str(readme_path)


- def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
+ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str, project_path: Path) -> bool:
    """
    Check whether the robot project needs to be rebuilt
    1. Check whether the robot project exists
@@ -310,11 +311,12 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
    Args:
        dataset_ids: list of source project IDs
        bot_id: robot ID
+       project_path: project path

    Returns:
        bool: whether a rebuild is needed
    """
-   robot_dir = Path("projects") / "robot" / bot_id
+   robot_dir = project_path / "robot" / bot_id

    # Create it if the robot project does not exist
    if not robot_dir.exists():
@@ -356,7 +358,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:

    # Check each source project's processing_log.json file
    for source_project_id in dataset_ids:
-       log_file = Path("projects") / "data" / source_project_id / "processing_log.json"
+       log_file = project_path / "data" / source_project_id / "processing_log.json"

        if not log_file.exists():
            logger.info(f"Processing log file not found for project {source_project_id}, will rebuild")
@@ -373,7 +375,7 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
    return False


- def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False) -> str:
+ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: bool = False, project_path: Path = Path("projects")) -> str:
    """
    Create a robot project by merging the dataset folders of multiple source projects

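A minimal usage sketch of the widened signature (IDs are placeholder values; the project root shown is the one create_project_directory passes earlier in this diff):

```python
from pathlib import Path
from utils.multi_project_manager import create_robot_project

# Hypothetical call site; real callers supply their own dataset IDs and bot ID.
robot_dir = create_robot_project(
    dataset_ids=["demo-project-1", "demo-project-2"],  # assumed example IDs
    bot_id="demo-bot",                                 # assumed example ID
    project_path=Path("~", ".deepagents"),
)
print(robot_dir)  # project_path / "robot" / bot_id, e.g. "~/.deepagents/robot/demo-bot"
```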
@@ -386,15 +388,15 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
        str: robot project directory path
    """
    logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}")

    # Check whether a rebuild is needed
-   if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id):
-       robot_dir = Path("projects") / "robot" / bot_id
+   if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id, project_path):
+       robot_dir = project_path / "robot" / bot_id
        logger.info(f"Using existing robot project: {robot_dir}")
        return str(robot_dir)

    # Create the robot directory structure
-   robot_dir = Path("projects") / "robot" / bot_id
+   robot_dir = project_path / "robot" / bot_id
    dataset_dir = robot_dir / "dataset"

    # Clean up the existing directory (if needed)
@@ -411,7 +413,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
    for source_project_id in dataset_ids:
        logger.info(f"\nProcessing source project: {source_project_id}")

-       source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset"
+       source_dataset_dir = project_path / "data" / source_project_id / "dataset"

        if not source_dataset_dir.exists():
            logger.warning(f" Warning: Dataset directory not found for project {source_project_id}")
@@ -426,7 +428,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo

        # Copy each folder
        for folder in folders:
-           result = copy_dataset_folder(source_project_id, dataset_dir, folder.name)
+           result = copy_dataset_folder(source_project_id, dataset_dir, folder.name, project_path)
            copy_results.append(result)

    # Save the configuration info
@@ -442,7 +444,7 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
        json.dump(config_data, f, ensure_ascii=False, indent=2)

    # Generate the README
-   readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results)
+   readme_path = generate_robot_readme(bot_id, dataset_ids, copy_results, project_path)

    # Statistics
    successful_copies = sum(1 for r in copy_results if r["success"])
@@ -456,78 +458,6 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
    return str(robot_dir)


- def get_robot_project_info(bot_id: str) -> Dict:
-     """
-     Get robot project information
-
-     Args:
-         bot_id: robot ID
-
-     Returns:
-         Dict: robot project information
-     """
-     robot_dir = Path("projects") / "robot" / bot_id
-
-     if not robot_dir.exists():
-         return {
-             "exists": False,
-             "bot_id": bot_id,
-             "error": "Robot project does not exist"
-         }
-
-     dataset_dir = robot_dir / "dataset"
-     readme_path = robot_dir / "README.md"
-
-     # Count the folders
-     folder_count = 0
-     total_size = 0
-     if dataset_dir.exists():
-         for item in dataset_dir.iterdir():
-             if item.is_dir():
-                 folder_count += 1
-                 # Compute folder size
-                 for file_path in item.rglob('*'):
-                     if file_path.is_file():
-                         total_size += file_path.stat().st_size
-
-     return {
-         "exists": True,
-         "bot_id": bot_id,
-         "robot_dir": str(robot_dir),
-         "dataset_dir": str(dataset_dir),
-         "readme_exists": readme_path.exists(),
-         "folder_count": folder_count,
-         "total_size_bytes": total_size,
-         "total_size_mb": round(total_size / (1024 * 1024), 2)
-     }
-
-
- def cleanup_robot_project(bot_id: str) -> bool:
-     """
-     Clean up the robot project
-
-     Args:
-         bot_id: robot ID
-
-     Returns:
-         bool: whether cleanup succeeded
-     """
-     try:
-         robot_dir = Path("projects") / "robot" / bot_id
-
-         if robot_dir.exists():
-             shutil.rmtree(robot_dir)
-             logger.info(f"Cleaned up robot project: {bot_id}")
-             return True
-         else:
-             logger.info(f"Robot project does not exist: {bot_id}")
-             return True
-
-     except Exception as e:
-         logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}")
-         return False


if __name__ == "__main__":
    # Test code
    test_dataset_ids = ["test-project-1", "test-project-2"]
@@ -536,5 +466,3 @@ if __name__ == "__main__":
    robot_dir = create_robot_project(test_dataset_ids, test_bot_id)
    logger.info(f"Created robot project at: {robot_dir}")

-     info = get_robot_project_info(test_bot_id)
-     logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
@@ -1,179 +0,0 @@
#!/usr/bin/env python3
import os
import shutil
import logging
from pathlib import Path

# Configure logging
logger = logging.getLogger('app')

def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
    if not target_file.exists():
        return False

    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False

def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""

    project_dir = Path("projects") / "data" / unique_id

    if not project_dir.exists():
        logger.error(f"Project directory not found: {project_dir}")
        return

    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")

    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"

    # Check if files directory exists and has files
    if not files_dir.exists():
        logger.info(f" No files directory found, skipping...")
        return

    files = list(files_dir.glob("*"))
    if not files:
        logger.info(f" Files directory is empty, skipping...")
        return

    # Create dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)

    # Copy each file to its own directory
    for file_path in files:
        if file_path.is_file():
            # Get filename without extension as directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"

            # Check if file is already processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                logger.info(f" Skipping already processed file: {file_path.name}")
                continue

            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

            # Create target directory
            target_dir.mkdir(exist_ok=True)

            # Copy and rename file
            shutil.copy2(str(file_path), str(target_file))

    print(f" Files remain in original location (copied to dataset structure)")

    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))

        from embedding import split_document_by_pages, embed_document

        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"

                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue

                # Split document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue

                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )

                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(f" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")

        print(f" Document processing completed for project {unique_id}")
    else:
        print(f" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""

    projects_dir = Path("projects") / "data"

    if not projects_dir.exists():
        print("Projects directory not found")
        return

    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]

    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")

        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"

        # Check if files directory exists and has files
        if not files_dir.exists():
            logger.info(f" No files directory found, skipping...")
            continue

        files = list(files_dir.glob("*"))
        if not files:
            logger.info(f" Files directory is empty, skipping...")
            continue

        # Create dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)

        # Move each file to its own directory
        for file_path in files:
            if file_path.is_file():
                # Get filename without extension as directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"

                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

                # Create target directory
                target_dir.mkdir(exist_ok=True)

                # Copy and rename file
                shutil.copy2(str(file_path), str(target_file))

        print(f" Files remain in original location (copied to dataset structure)")

    print("\nFile organization complete!")

if __name__ == "__main__":
    organize_dataset_files()
@@ -54,6 +54,17 @@ def setup_deepagents_symlink():
        logger.info(f"Removed existing symlink pointing to {target}")

    # Create the symbolic link
+   # Check again before creating to handle race conditions
+   if deepagents_dir.is_symlink() or deepagents_dir.exists():
+       logger.warning(f"Path {deepagents_dir} exists, attempting to remove before symlink")
+       if deepagents_dir.is_symlink():
+           deepagents_dir.unlink()
+       elif deepagents_dir.is_dir():
+           import shutil
+           shutil.rmtree(str(deepagents_dir))
+       else:
+           deepagents_dir.unlink()
+
    os.symlink(robot_dir, deepagents_dir, target_is_directory=True)
    logger.info(f"Created symbolic link: {deepagents_dir} -> {robot_dir}")
    return True
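For orientation, a minimal self-contained sketch of the symlink pattern this commit adds (function name, paths, and the example call are assumptions, not the project's actual setup_deepagents_symlink):

```python
import logging
import os
import shutil
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("app")

def setup_symlink_sketch(robot_dir: Path, link_path: Path) -> bool:
    """Point link_path (e.g. ~/.deepagents) at robot_dir, replacing anything stale."""
    # Remove whatever currently occupies link_path: a symlink, a directory, or a file.
    if link_path.is_symlink() or link_path.exists():
        if link_path.is_symlink():
            link_path.unlink()
        elif link_path.is_dir():
            shutil.rmtree(link_path)
        else:
            link_path.unlink()

    os.symlink(robot_dir, link_path, target_is_directory=True)
    logger.info(f"Created symbolic link: {link_path} -> {robot_dir}")
    return True

# Example with assumed paths (commented out to avoid touching the filesystem):
# setup_symlink_sketch(Path("projects/robot/demo-bot").resolve(), Path.home() / ".deepagents")
```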